In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
############################################################################################
#    STORE ITEM DEMAND FORECASTING KAGGLE CHALLENGE
#    Serkan KAYA , 22 February 2022
############################################################################################

# Business Problem:
#
# A chain of stores asks for a 3-month estimate for 50 different products distributed in
# 10 different stores in different locations.

#
############################################################################################
#   History of Dataset
############################################################################################
# This dataset is presented to test different time series techniques.
#
# A store chain's 5-year data includes information on 10 different stores for 50 different
# products.
#
############################################################################################
#   Variables
############################################################################################
# date – Date of the sales record. There are no holiday effects or store closures.
# Store – Store ID. Unique number for each store.
# Item – Product ID. Unique number for each product.
# Sales – Number of items sold. The number of products sold from a particular store on a given date.
#
###########################################################################################
#   PROJECT TASKS
############################################################################################
#
# Create a 3-month demand forecasting model for the relevant store chain using the following
# time series and machine learning techniques:
#
# ▪ Random Noise
# ▪ Lag/Shifted Features
# ▪ Rolling Mean Features
# ▪ Exponentially Weighted Mean Features
# ▪ Custom Cost Function (SMAPE)
# ▪ Model validation through LightGBM
#
############################################################################################

In [None]:
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import lightgbm as lgb
import warnings

# Show every column and let pandas choose the display width automatically.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# NOTE: this silences *all* warnings, including library deprecation notices.
warnings.filterwarnings('ignore')

# Seed NumPy's global generator so that anything drawing from np.random
# (e.g. the noise added to engineered features) is identical on every
# "Restart & Run All".
SEED = 42
np.random.seed(SEED)


In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    Sections printed, in order: shape, column dtypes, first ``head`` rows,
    last ``head`` rows, missing-value count per column, and a table of
    quantiles (0, 5, 50, 95, 99 and 100 %) of the numeric columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    head : int, default 5
        Number of rows shown in the Head and Tail sections.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # numeric_only=True is stated explicitly: it was the implicit default in
    # pandas 1.x, but pandas 2.x defaults to False and then raises TypeError
    # as soon as the frame holds a string/object column.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)



In [None]:
########################
# Loading the data
########################

# Paths are relative to the notebook's working directory.
# 'date' is parsed to datetime at read time so the .dt accessor can be used
# when the date features are built later on.
train = pd.read_csv('../input/demand-forecasting-kernels-only/train.csv', parse_dates=['date'])
test = pd.read_csv('../input/demand-forecasting-kernels-only/test.csv', parse_dates=['date'])
sample_sub = pd.read_csv('../input/demand-forecasting-kernels-only/sample_submission.csv')
# Last expression of the cell, so the three shapes are shown as its output.
train.shape, test.shape, sample_sub.shape # (913000,4) (45000,4) (45000,2)

In [None]:
# Stack train and test into one frame so that lag / rolling / EWM features for
# the test period can be derived from the training history.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the test rows reuse index labels already present in the train rows, which
# makes any later label-based alignment ambiguous.
df = pd.concat([train, test], sort=False, ignore_index=True)
df.head()


In [None]:
#####################################################
# EDA
#####################################################

# A cell only displays its final expression, so the date range is printed
# explicitly instead of being computed and silently discarded.
print("Date range:", df['date'].min(), "->", df['date'].max())
check_df(train)

In [None]:
check_df(test)

In [None]:
check_df(df)

In [None]:
df['sales'].describe([0.10, 0.30, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99])

In [None]:

# How many stores are there ?
df[['store']].nunique()


In [None]:
# How many items are there ?
df[['item']].nunique()

In [None]:
# Does every store carry the same number of distinct items?
df.groupby('store').agg({'item':'nunique'})

In [None]:
# What are the total sales for each store-item pair?
df.groupby(['store','item']).agg({'sales':'sum'})

In [None]:
# What are the descriptive statistics with store and item breakdown?
df.groupby(["store", "item"]).agg({"sales": ["sum", "mean", "median", "std"]})

In [None]:
#####################################################
# FEATURE ENGINEERING
#####################################################

df.head()  # Totally we have only 4 variables. We need to generate much more variables in Feature Engineering.

In [None]:
########################
# Date Features
########################

def create_date_features(df):
    """Add calendar-based feature columns derived from ``df['date']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with these integer-valued columns added:
        month, day_of_month, day_of_year, week_of_year, day_of_week, year,
        is_wknd, is_month_start, is_month_end, quarter, is_quarter_start,
        is_quarter_end, is_christmas_week.
    """
    dates = df.date.dt

    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement. It returns a nullable UInt32
    # Series, so it is converted back to a plain NumPy dtype (int64, or
    # float64 with NaN when some dates are missing) as weekofyear produced.
    iso_week = dates.isocalendar().week
    if iso_week.isna().any():
        iso_week = iso_week.astype('float64')
    else:
        iso_week = iso_week.astype('int64')

    df['month'] = dates.month
    df['day_of_month'] = dates.day
    df['day_of_year'] = dates.dayofyear
    df['week_of_year'] = iso_week
    df['day_of_week'] = dates.dayofweek
    df['year'] = dates.year
    # weekday is 0 (Monday) .. 6 (Sunday); integer division by 4 maps
    # Friday, Saturday and Sunday to 1 - Friday is deliberately a weekend day.
    df['is_wknd'] = dates.weekday // 4
    df['is_month_start'] = dates.is_month_start.astype(int)
    df['is_month_end'] = dates.is_month_end.astype(int)
    df['quarter'] = dates.quarter
    df['is_quarter_start'] = dates.is_quarter_start.astype(int)
    df['is_quarter_end'] = dates.is_quarter_end.astype(int)
    # ISO week 51 is used as the "Christmas week" flag.
    df['is_christmas_week'] = (iso_week == 51).astype(int)
    return df


In [None]:
df = create_date_features(df)
df.head()

In [None]:
df.groupby(["store", "item","month"]).agg({"sales": ["sum", "mean", "median", "std"]})

In [None]:
########################
# Random Noise
########################

def random_noise(dataframe, scale=1.6, random_state=None):
    """Return one zero-mean Gaussian noise value per row of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Only its length is used.
    scale : float, default 1.6
        Standard deviation of the noise.
    random_state : int or None, default None
        When None, the draw comes from NumPy's global generator, so it
        follows any ``np.random.seed(...)`` set earlier. When given, a
        dedicated generator seeded with this value is used, which makes the
        result reproducible regardless of global state.

    Returns
    -------
    numpy.ndarray
        1-D float array of shape ``(len(dataframe),)``.
    """
    size = (len(dataframe),)
    if random_state is None:
        return np.random.normal(scale=scale, size=size)
    return np.random.default_rng(random_state).normal(scale=scale, size=size)


In [None]:
########################
# Lag/Shifted Features
########################
# Lagged sales are used to capture the seasonality in the time series.

def lag_features(dataframe, lags):
    """Add noisy lagged-sales columns, computed per (store, item) series.

    For every ``lag`` a column ``sales_lag_<lag>`` is added that holds the
    ``sales`` value ``lag`` rows earlier within the same store/item group,
    plus Gaussian noise from ``random_noise``. The first ``lag`` rows of
    each group are NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``store``, ``item`` and ``sales``; modified in place.
        Rows are assumed to be in chronological order within each group.
    lags : iterable of int
        Shift distances, in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object with the new columns appended.
    """
    # The grouping does not depend on the lag, so build it once.
    sales_by_series = dataframe.groupby(['store', 'item'])['sales']
    for lag in lags:
        # GroupBy.shift is the vectorised equivalent of
        # transform(lambda x: x.shift(lag)) and avoids a Python call per group.
        dataframe['sales_lag_' + str(lag)] = sales_by_series.shift(lag) + random_noise(dataframe)
    return dataframe

df = lag_features(df, [16, 23, 31, 38, 61, 68, 91, 98, 105, 112, 119, 126, 182, 211 ,364, 546, 728])

In [None]:
df.head()
df.shape # (958000, 35)

In [None]:
########################
# Rolling Mean (Moving Average) Features
########################
# Rolling means are used to capture the trend in the time series.

def roll_mean_features(dataframe, windows):
    """Add noisy rolling-mean columns of past sales per (store, item) series.

    For every ``window`` a column ``sales_roll_mean_<window>`` is added. It is
    a triangular-weighted rolling mean over the previous ``window`` rows of
    the group (the series is shifted by one row first), requires at least 10
    observations, and has Gaussian noise from ``random_noise`` added.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``store``, ``item`` and ``sales``; modified in place.
    windows : iterable of int
        Rolling window lengths, in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object with the new columns appended.
    """
    group_keys = ['store', 'item']
    for window in windows:
        def _shifted_triang_mean(series, span=window):
            # shift(1) keeps a row's own sales value out of its window.
            shifted = series.shift(1)
            return shifted.rolling(window=span, min_periods=10, win_type='triang').mean()

        smoothed = dataframe.groupby(group_keys)['sales'].transform(_shifted_triang_mean)
        dataframe['sales_roll_mean_' + str(window)] = smoothed + random_noise(dataframe)
    return dataframe

df = roll_mean_features(df, [16, 23, 32, 62, 68, 98, 105, 126, 182, 211, 365, 546, 728])

In [None]:
df.shape # (958000, 48)

In [None]:
########################
# Exponentially Weighted Mean Features (exponentially weighted averages)
########################


def ewm_features(dataframe, alphas, lags):
    """Add exponentially weighted mean columns of lagged sales.

    For every (alpha, lag) pair a column named
    ``sales_ewm_alpha<alpha without the dot>_lag_<lag>`` is added. Within each
    (store, item) group the sales are shifted by ``lag`` rows and then
    smoothed with ``ewm(alpha=alpha).mean()``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``store``, ``item`` and ``sales``; modified in place.
    alphas : iterable of float
        Smoothing factors.
    lags : iterable of int
        Shift distances, in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object; columns are appended alpha by alpha,
        and lag by lag within each alpha.
    """
    sales_groups = dataframe.groupby(['store', 'item'])['sales']
    for alpha in alphas:
        # e.g. 0.95 -> '095', so the column name carries no dot
        alpha_tag = str(alpha).replace('.', '')
        for lag in lags:
            column = 'sales_ewm_alpha' + alpha_tag + '_lag_' + str(lag)
            dataframe[column] = sales_groups.transform(
                lambda series: series.shift(lag).ewm(alpha=alpha).mean())
    return dataframe

# Smoothing factors and row lags passed to ewm_features in the next cell;
# one column is created per (alpha, lag) combination.
alphas = [0.95, 0.9, 0.8, 0.7, 0.5, 0.4, 0.3]
lags = [23, 31, 62, 91, 98, 105, 112, 126, 180, 270, 365, 546, 728]

In [None]:
df = ewm_features(df, alphas, lags)

In [None]:
df.shape # (958000, 139)

In [None]:
########################
# One-Hot Encoding
########################

# One-hot encode the store/item identifiers and the two calendar categories.
# NOTE: 'df' is rebound to the encoded frame, so re-running this cell on the
# already-encoded frame will fail because the listed columns no longer exist.
df = pd.get_dummies(df, columns=['store','item','day_of_week','month'])

df.shape # (958000, 214)

In [None]:
########################
# Converting sales to log(1+sales)
########################

# Train on log(1 + sales); predictions are mapped back with np.expm1 later.
# WARNING: this cell overwrites 'sales' with its own transform, so it is not
# idempotent - running it a second time applies the log twice. Re-run the
# notebook from the data-loading cell if this cell needs to be executed again.
df['sales'] = np.log1p(df['sales'].values)
df.head()

In [None]:
#####################################################
# Building up the Model
#####################################################

In [None]:
########################
# Custom Cost Function
########################

def smape(preds, target):
    """Symmetric mean absolute percentage error, in percent.

    Pairs where both the prediction and the target are exactly zero are
    left out of the sum (they would be 0/0) but still count towards the
    number of observations in the denominator.

    Parameters
    ----------
    preds, target : array-like of equal length
        Predicted and actual values (NumPy arrays or pandas Series).

    Returns
    -------
    float
        ``200 / n * sum(|pred - target| / (|pred| + |target|))``, where ``n``
        is the length of ``preds`` before any pair is dropped.
    """
    total_count = len(preds)
    # Keep a pair unless prediction and target are both zero.
    keep = (preds != 0) | (target != 0)
    kept_preds = preds[keep]
    kept_target = target[keep]
    abs_error = np.abs(kept_preds - kept_target)
    magnitude = np.abs(kept_preds) + np.abs(kept_target)
    return (200 * np.sum(abs_error / magnitude)) / total_count


def lgbm_smape(preds, train_data):
    """SMAPE evaluation function in the form expected by ``lgb.train(feval=...)``.

    Parameters
    ----------
    preds : numpy.ndarray
        Model predictions, in log1p space.
    train_data : object exposing ``get_label()``
        The dataset being evaluated; its labels are in log1p space too.

    Returns
    -------
    tuple
        ``('SMAPE', value, False)`` - metric name, metric value, and a flag
        that tells LightGBM a higher value is NOT better.
    """
    # Sales were log1p-transformed before training, so expm1 (the inverse of
    # log1p) brings both labels and predictions back to the original scale.
    actual = np.expm1(train_data.get_label())
    predicted = np.expm1(preds)
    return 'SMAPE', smape(predicted, actual), False

In [None]:
########################
# Time-Based Validation Sets
########################

# A cell only displays its final expression, so the train and test date ranges
# are combined into one value instead of discarding the first of them.
(train['date'].min(), train['date'].max()), (test['date'].min(), test['date'].max())

In [None]:
# NOTE: 'train' is rebound here from the raw CSV frame to a slice of the
# feature-engineered 'df'; the raw frame is not reachable under this name
# afterwards.
# Training uses everything before 2017; validation is the first three months
# of 2017.
train = df.loc[(df['date']<'2017-01-01'), :]
val = df.loc[(df['date']>='2017-01-01') & (df['date']<'2017-04-01'), :]  # Validation set

# Feature list: every column except the target and the raw identifier/date columns.
cols = [col for col in train.columns if col not in ['date','id','sales','year']]
# 'date', 'id', 'sales' and 'year' are no longer needed as inputs: the date-derived
# features were already generated and 'sales' is the target.

In [None]:
# Target (log1p-transformed sales) and feature matrix for the training period.
Y_train = train['sales']
X_train = train[cols]

# Same split for the validation period.
Y_val = val['sales']
X_val = val[cols]

Y_train.shape, X_train.shape, Y_val.shape, X_val.shape # ((730500,), (730500, 210), (45000,), (45000, 210))


In [None]:
########################
# LightGBM Model
########################

# LightGBM parameters

# Hyper-parameters for the validation run.
lgb_params = {'num_leaves':10,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'num_boost_round': 20000,      # upper bound; also read back by the lgb.train call
              'early_stopping_rounds': 200,  # also read back by the lgb.train call
              'nthread': -1}

In [None]:
# Wrap the pandas frames in LightGBM Dataset objects. The validation Dataset
# is created with reference=lgbtrain so that it is tied to the training one.
lgbtrain = lgb.Dataset(data = X_train, label = Y_train, feature_name=cols)
lgbval = lgb.Dataset(data = X_val, label = Y_val, reference=lgbtrain, feature_name=cols)

In [None]:
# Train on the pre-2017 data and evaluate on both sets with the custom SMAPE.
#
# The early_stopping_rounds= and verbose_eval= keyword arguments of
# lgb.train() were deprecated in LightGBM 3.3 and removed in 4.0, so they are
# no longer passed:
#   * early stopping is taken from lgb_params['early_stopping_rounds'],
#     which lgb.train() reads from the params dict;
#   * progress logging every 100 rounds uses the log_evaluation callback
#     (requires LightGBM >= 3.3).
model = lgb.train(lgb_params, lgbtrain,
                  valid_sets = [lgbtrain, lgbval],
                  num_boost_round = lgb_params['num_boost_round'],
                  feval = lgbm_smape,
                  callbacks = [lgb.log_evaluation(period=100)])

In [None]:
# Our Training and Validation Percentage Errors
# [6681]	training's SMAPE: 12.7643%	 valid_1's SMAPE: 13.383 %

In [None]:
# We can calculate our error by ourselves
# Predict the validation period using the iteration chosen by early stopping.
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

# Both arrays are in log1p space, so expm1 is applied before scoring.
smape(np.expm1(y_pred_val), np.expm1(Y_val)) # Our prediction error: 13.383 %

In [None]:
########################
# Feature importances
########################

In [None]:
def plot_lgb_importances(model, plot=False, num=10):
    """Show the ``num`` most important features of a trained LightGBM model.

    Features are ranked by their share of total gain (in percent).

    Parameters
    ----------
    model : object exposing ``feature_importance(kind)`` and ``feature_name()``
        The trained booster.
    plot : bool, default False
        If True draw a horizontal bar chart, otherwise print a table with the
        columns ``feature``, ``split`` and ``gain``.
    num : int, default 10
        Number of top features to show. It is honoured in both modes; the
        plot previously ignored it and always drew 25 bars.

    Returns
    -------
    None
    """
    gain = model.feature_importance('gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name(),
                             'split': model.feature_importance('split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    top_features = feat_imp.head(num)
    if plot:
        plt.figure(figsize=(10, 10))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=top_features)
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    else:
        print(top_features)


plot_lgb_importances(model, num=30)

In [None]:
plot_lgb_importances(model, plot=True, num=30)

In [None]:
########################
# Final Model
########################

In [None]:
# Final fit uses every row with a known target; rows whose 'sales' is NaN
# form the set to be predicted.
# NOTE: 'train', 'Y_train', 'X_train' and 'test' are rebound here and replace
# the objects of the same name used in the validation section.
train = df.loc[~df.sales.isna()]  

Y_train = train['sales']
X_train = train[cols]

test = df.loc[df.sales.isna()]
X_test = test[cols]

In [None]:
# Hyper-parameters for the final fit: no early stopping, and the number of
# boosting rounds is fixed to the best iteration of the validation run.
# NOTE: 'model' must still be the validation model when this cell runs; the
# training cell below rebinds 'model', so re-running this cell afterwards
# would read best_iteration from the final model instead.
lgb_params = {'metric':{'mae'},
              'num_leaves':10,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'num_boost_round': model.best_iteration,
              'nthread': -1}

In [None]:
lgbtrain_all = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)

In [None]:
# Refit on all labelled data for the number of rounds found during validation.
# NOTE: this rebinds 'model' (validation model -> final model), so the cell is
# not idempotent: on a second run model.best_iteration comes from the final
# model rather than from the validation run.
model = lgb.train(lgb_params, lgbtrain_all, num_boost_round=model.best_iteration) 

In [None]:
# 'model' is now the final model, which was trained without early stopping.
# NOTE(review): confirm that predict() uses all trees when best_iteration was
# not set by early stopping.
test_preds = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Build the submission from the test rows. .copy() makes submission_df an
# independent frame, so the column assignments below act on it alone and do
# not trigger chained-assignment warnings against 'test'.
submission_df = test.loc[:, ['id','sales']].copy()
# Predictions are in log1p space; expm1 converts them back to unit sales.
submission_df['sales'] = np.expm1(test_preds)
submission_df['id'] = submission_df.id.astype(int)
submission_df.to_csv('submission.csv', index=False) # Our output submission file for the challenge

In [None]:
submission_df.head(20)