<img src='https://storage.googleapis.com/kaggle-competitions/kaggle/9999/logos/header.png?t=2018-06-28-21-19-41' />

# Store Item Demand Forecasting

**Description**

This competition is provided as a way to explore different time series techniques on a relatively simple and clean dataset.

You are given years of store-item sales data, and asked to predict 2015 sales for 4 different items at 4 different stores.

What's the best way to deal with seasonality? Should stores be modeled separately, or can you pool them together? Does deep learning work better than ARIMA? Can either beat xgboost?

**Evaluation**

Submissions are evaluated on [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error) between forecasts and actual values. We define [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error) = 0 when the actual and predicted values are both 0.

**Variables:**
- date
- store
- item
- sales

# Packages

In [None]:
# Base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Model
import lightgbm as lgb
import shap
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

# Configuration
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Data

In [None]:
# Load the train/test files (parsing "date" as datetime) and the sample submission.
train = pd.read_csv('../input/resolvr-set/Train_data.csv', parse_dates=['date'])
test = pd.read_csv('../input/resolvr-set/Test_data.csv', parse_dates=['date'])
sample_sub = pd.read_csv('../input/resolvr-set/sample_submission.csv')
# Stack train and test into one frame; sort=False keeps the column order as-is.
df = pd.concat([train, test], sort=False)

# Print the three shapes as a sanity check, then preview the training rows.
print(train.shape, test.shape, df.shape, "\n")
train.head()

# Exploratory Data Analysis

In [None]:
# Number of distinct stores and items, reported separately for train and test
train.store.nunique(), test.store.nunique(), train.item.nunique(), test.item.nunique()

In [None]:
# First and last date covered by the train set and by the test set
train["date"].min(), train["date"].max(), test["date"].min(), test["date"].max()

In [None]:
# Number of distinct items per store (train and test combined)
df.groupby(["store"])["item"].nunique()

In [None]:
# Summary statistics of sales per store, computed on the combined train+test frame
df.groupby(["store"]).agg({"sales": ["count","sum", "mean", "median", "std", "min", "max"]})

In [None]:
# Summary statistics of sales per item, computed on the combined train+test frame
df.groupby(["item"]).agg({"sales": ["count","sum", "mean", "median", "std", "min", "max"]})

#### Histogram: Store Sales

In [None]:
# One sales histogram per store (stores 1-4), side by side.
# A single row of axes is allocated because only four panels are ever drawn;
# a second row would just render as empty plots.
fig, axes = plt.subplots(1, 4, figsize=(60, 15))
for store_id, ax in zip(range(1, 5), axes):
    train[train.store == store_id].sales.hist(ax=ax)
    ax.set_title("Store " + str(store_id), fontsize=15)

plt.tight_layout(pad=4.5)
plt.suptitle("Histogram: Sales");

#### Sales distribution for each item in the 2nd store

In [None]:
# Daily sales of items 1-4 for one store, one panel per item.
store = 2
sub = train[train.store == store].set_index("date")

# Four panels in one row: only four items are plotted, so a 4x4 grid would
# leave twelve empty axes.
fig, axes = plt.subplots(1, 4, figsize=(60, 15))
for item_id, ax in zip(range(1, 5), axes):
    sub[sub.item == item_id].sales.plot(ax=ax, legend=True, label="Item " + str(item_id) + " Sales")
plt.tight_layout(pad=4.5)
# Build the title from `store` so it cannot drift from the data being shown.
plt.suptitle("Store " + str(store) + " Item Sales");

#### Correlation between total sales of stores

In [None]:
# Total sales per (date, store), indexed by date.
storesales = train.groupby(["date", "store"]).sales.sum().reset_index().set_index("date")
# Pivot to one column per store and compute the Spearman correlation between stores.
corr =  pd.pivot_table(storesales, values = "sales", columns="store", index="date").corr(method = "spearman")
plt.figure(figsize = (7,7))
# Only cells with correlation >= 0.5 or <= -0.5 are kept; the mask turns the others into NaN.
sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.5)], 
            cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 9}, square=True);

#### Hypothesis Testing

##### Stores

In [None]:
# T Test
def CompareTwoGroups(dataframe, group, target):
    """Compare `target` between every pair of levels of `group`.

    For each pair of distinct values in ``dataframe[group]``:

    1. Shapiro-Wilk normality test on both samples (alpha = 0.05).
    2. If neither sample rejects normality: Levene test for equal variances,
       then Student's t-test (equal variances) or Welch's t-test (unequal).
    3. Otherwise: Mann-Whitney U test.

    Returns a DataFrame with one row per pair (indexed by the pair) holding
    the test family used, the verdict ("Different Groups" when p < 0.05,
    else "Similar Groups"), the p-value and each group's median, mean and count.
    """
    import itertools
    import scipy.stats as stats

    AB = pd.DataFrame()
    for pair in itertools.combinations(dataframe[group].unique(), 2):
        first_level, second_level = pair
        groupA = dataframe[dataframe[group] == first_level][target]
        groupB = dataframe[dataframe[group] == second_level][target]

        # True means "normality rejected" for that sample.
        rejects_normal_a = stats.shapiro(groupA)[1] < 0.05
        rejects_normal_b = stats.shapiro(groupB)[1] < 0.05
        both_normal = not (rejects_normal_a or rejects_normal_b)

        if both_normal:
            # Parametric branch: the variance check decides Student vs. Welch.
            variances_differ = stats.levene(groupA, groupB)[1] < 0.05
            p_value = stats.ttest_ind(groupA, groupB, equal_var=not variances_differ)[1]
        else:
            # Non-parametric branch.
            p_value = stats.mannwhitneyu(groupA, groupB)[1]

        row = pd.DataFrame({
            "TestType": ["Parametric" if both_normal else "Non-Parametric"],
            "Compare Two Groups": ["Different Groups" if p_value < 0.05 else "Similar Groups"],
            "p-value": [p_value],
            "GroupA_Median": [groupA.median()], "GroupB_Median": [groupB.median()],
            "GroupA_Mean": [groupA.mean()], "GroupB_Mean": [groupB.mean()],
            "GroupA_Count": [groupA.count()], "GroupB_Count": [groupB.count()],
        }, index=[pair])

        AB = pd.concat([AB, row])

    return AB
    
    
# Pairwise comparison of daily total sales between stores
CompareTwoGroups(storesales, group = "store", target = "sales")

##### Items

In [None]:
# Total sales per (date, item), indexed by date.
itemsales = train.groupby(["date", "item"]).sales.sum().reset_index().set_index("date")
# Pairwise comparison between items; show only the pairs judged "Similar Groups".
ctg_is = CompareTwoGroups(itemsales, group = "item", target = "sales")
ctg_is[ctg_is["Compare Two Groups"] == "Similar Groups"]

# Feature Engineering

- Time Related Features
- Lagged Features
- Moving Average Features
- Hypothesis Testing: Similarity Features
- Exponentially Weighted Mean Features

In [None]:
# 1. Time Related Features
#####################################################
def create_date_features(df):
    """Add calendar features derived from the datetime column ``df.date``.

    The columns are written into ``df`` itself, which is also returned.

    Added columns: month, day_of_month, day_of_year, week_of_year (ISO week),
    day_of_week (1 = Monday ... 7 = Sunday), year, is_wknd, quarter,
    is_month_start/end, is_quarter_start/end, is_year_start/end (0/1 ints)
    and season (0 winter, 1 spring, 2 summer, 3 fall).
    """
    dates = df.date.dt
    df['month'] = dates.month
    df['day_of_month'] = dates.day
    df['day_of_year'] = dates.dayofyear
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week returns
    # the same ISO week number. Cast from UInt32 to a plain integer column.
    df['week_of_year'] = dates.isocalendar().week.astype(int)
    df['day_of_week'] = dates.dayofweek + 1
    df['year'] = dates.year
    # weekday is 0 (Monday) .. 6 (Sunday), so // 4 flags Friday, Saturday and Sunday.
    # NOTE(review): Friday is counted as weekend here - confirm that is intended.
    df["is_wknd"] = dates.weekday // 4
    df["quarter"] = dates.quarter
    df['is_month_start'] = dates.is_month_start.astype(int)
    df['is_month_end'] = dates.is_month_end.astype(int)
    df['is_quarter_start'] = dates.is_quarter_start.astype(int)
    df['is_quarter_end'] = dates.is_quarter_end.astype(int)
    df['is_year_start'] = dates.is_year_start.astype(int)
    df['is_year_end'] = dates.is_year_end.astype(int)
    # 0: Winter - 1: Spring - 2: Summer - 3: Fall
    df["season"] = np.where(df.month.isin([12, 1, 2]), 0, 1)
    df["season"] = np.where(df.month.isin([6, 7, 8]), 2, df["season"])
    df["season"] = np.where(df.month.isin([9, 10, 11]), 3, df["season"])
    return df
df = create_date_features(df)


# Rolling Summary Stats Features
#####################################################
# Rolling windows are only meaningful on chronologically ordered series, so
# order the rows within every (store, item) series first.
df.sort_values(by=["store", "item", "date"], inplace=True)

for window in [2,5,10,20,35,50,70,91, 98, 105, 112, 119, 126, 186, 200, 210, 250, 300, 365]:
    # shift(1) makes each row see only strictly earlier sales. Doing the shift and
    # the rolling mean inside transform keeps every window within its own
    # (store, item) series and returns values aligned with df's rows, instead of
    # writing a group-ordered array back by position.
    df["sales_roll_mean_" + str(window)] = df.groupby(["store", "item"]).sales.transform(
        lambda series: series.shift(1).rolling(window).mean())
    # Rolling std / max / min can be added the same way by swapping .mean().


# 2. Hypothesis Testing: Similarity
#####################################################

# Store Based
# Re-run the pairwise store comparison on daily store totals.
# ctg_ss is not referenced again in this cell.
storesales = train.groupby(["date", "store"]).sales.sum().reset_index()
ctg_ss = CompareTwoGroups(storesales, group="store", target="sales")
del storesales

# Encode groups of stores: 1 for stores 3/10, 2 for stores 4/9, 3 for stores 5/6, 0 otherwise.
# NOTE(review): these store ids are hard-coded rather than read from ctg_ss - confirm that
# they exist in this dataset and match the current test results.
df["StoreSalesSimilarity"] = np.where(df.store.isin([3,10]), 1, 0)
df["StoreSalesSimilarity"] = np.where(df.store.isin([4,9]), 2, df["StoreSalesSimilarity"])
df["StoreSalesSimilarity"] = np.where(df.store.isin([5,6]), 3, df["StoreSalesSimilarity"])



# 3. Lag/Shifted Features
#####################################################

# test.groupby(["store", "item"]).date.count()
# The test set asks for a forecast 90 days ahead, so
# lag features should be at least 91 days!

# Order rows chronologically within each (store, item) series before shifting.
df.sort_values(by=['store', 'item', 'date'], axis=0, inplace=True)

def lag_features(dataframe, lags, groups = ["store", "item"], target = "sales", prefix = ''):
    """Return a copy of `dataframe` with one lagged column of `target` per lag.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified. Rows are shifted in their current
        order within each group, so sort chronologically beforehand.
    lags : iterable of int
        Number of rows to shift by within each group.
    groups : list of str
        Columns that define an independent series. The default list is only
        read, never mutated.
    target : str
        Column to lag.
    prefix : str
        Each new column is named ``prefix + str(lag)``.

    Returns
    -------
    pandas.DataFrame
        The copy with the lag columns appended.
    """
    dataframe = dataframe.copy()
    for lag in lags:
        # GroupBy.shift is the vectorised equivalent of
        # transform(lambda x: x.shift(lag)) and keeps the original row index.
        dataframe[prefix + str(lag)] = dataframe.groupby(groups)[target].shift(lag)
    return dataframe

# Lagged sales per (store, item) series, one column per lag, named sales_lag_<lag>.
# NOTE(review): lags shorter than the forecast horizon are presumably NaN for most test rows,
# since test rows carry no sales - confirm the short lags (2, 5, 10, ...) are intended.
df = lag_features(df, lags = [2,5,10,25,50,70,91,92,93,94,95,96, 97, 98, 100, 105, 112, 119, 126, 150,
                              182,200,220, 250, 300, 350, 355, 360,361,362,363, 364,
                              365, 370, 375,380, 546, 600],
                  groups = ["store", "item"], target = 'sales', prefix = 'sales_lag_')

def drop_cor(dataframe, name, index):
    """Keep the `name` columns best correlated with sales; drop the rest in place.

    Columns whose name matches `name` (``str.contains``, so a substring or regex
    match) are ranked by their signed correlation with ``sales``, highest first.
    Positions ``1:index`` of that ranking are kept, position 0 being expected to
    be ``sales`` itself, and every other matching column is dropped from
    `dataframe`. Returns None.
    """
    matching = dataframe.columns[dataframe.columns.str.contains(name)].tolist()
    ranking = dataframe[matching + ["sales"]].corr().sales.sort_values(ascending = False)
    keep = ranking.index[1:index]
    # Whatever still matches `name` once the keepers are set aside is discarded.
    leftover = dataframe.drop(keep, axis = 1).columns
    discard = leftover[leftover.str.contains(name)]
    dataframe.drop(discard, axis = 1, inplace = True)

# Keep the 15 sales_lag columns ranked highest by correlation with sales; drop the others.
drop_cor(df, "sales_lag", 16)


# 4. Last i. Months
#####################################################
df["monthyear"] = df.date.dt.to_period('M')


def _add_last_months_features(frame, keys, months, prefix):
    """Merge monthly sales statistics from `months` months earlier onto `frame`.

    Sales are aggregated (sum, mean, std, min, max) per `keys` + "monthyear".
    For each n in `months` the statistics are shifted n months back within each
    `keys` group and left-joined as ``<prefix><n>months_sales_<stat>`` columns.
    Shifting per group matters: a plain shift on the aggregated frame would hand
    the first months of one series the last months of the previous series.
    Returns the merged frame.
    """
    stats = ["sum", "mean", "std", "min", "max"]
    merge_keys = keys + ["monthyear"]
    # The monthly aggregate does not depend on the lag, so compute it once.
    monthly = frame.groupby(merge_keys).sales.agg(stats)
    for n_months in months:
        shifted = monthly.groupby(level=keys).shift(n_months).reset_index()
        shifted.columns = merge_keys + [
            prefix + str(n_months) + "months_sales_" + stat for stat in stats]
        frame = pd.merge(frame, shifted, how="left", on=merge_keys)
    return frame


# Store-Item Based
df = _add_last_months_features(df, ["store", "item"],
                               [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36], "last_")

# Keep only the store-item "last_" columns best correlated with sales. This runs
# before the store/item/similarity variants exist, so they are not affected.
drop_cor(df, "last_", 15)

# Store Based
df = _add_last_months_features(df, ["store"], [3, 6, 9, 12], "store_last_")

# Item Based
df = _add_last_months_features(df, ["item"], [3, 6, 9, 12], "item_last_")

# Similarity Based
df = _add_last_months_features(df, ["StoreSalesSimilarity"], [3, 6, 9, 12], "storesim_last_")


# The month key was only needed for the merges.
df.drop("monthyear", axis = 1, inplace = True)


# 5. Last i. day of week
#####################################################
# Order rows so that, within each (store, item, weekday) group, they are chronological.
df.sort_values(["store", "item", "day_of_week", "date"], inplace = True)

# A lag of k inside a (store, item, day_of_week) group is the k-th previous row
# with the same weekday.
df = lag_features(df, lags = np.arange(12,41, 1).tolist()+[91, 92, 95, 98, 99, 100, 105, 112, 119, 126, 133, 140, 200, 205, 210, 215, 220, 250],
                  groups = ["store", "item", "day_of_week"], target = 'sales', prefix = 'dayofweek_sales_lag_')

# Keep the 15 weekday-lag columns ranked highest by correlation with sales.
# (A stand-alone correlation expression used to sit here; its result was
# discarded mid-cell, so it only cost compute time and has been removed.)
drop_cor(df, "dayofweek_sales_lag_", 16)

# Restore the per-series chronological order used by the following steps.
df.sort_values(["store", "item", "date"], inplace = True)


#####################################################
# Exponentially Weighted Mean Features
#####################################################
def ewm_features(dataframe, alphas, lags):
    """Return a copy of `dataframe` with exponentially weighted mean features.

    For every (alpha, lag) pair, 'sales' is shifted by `lag` rows within each
    (store, item) group and smoothed with ``ewm(alpha=alpha).mean()``. The
    column is named ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>``.
    The input frame is left untouched.
    """
    result = dataframe.copy()
    for alpha in alphas:
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:
            def _shifted_ewm(series):
                # Runs straight away inside transform, so alpha/lag are the current pair.
                return series.shift(lag).ewm(alpha=alpha).mean()

            column = 'sales_ewm_alpha_' + alpha_tag + "_lag_" + str(lag)
            result[column] = result.groupby(["store", "item"])['sales'].transform(_shifted_ewm)
    return result

# Smoothing factors and lags to combine: one feature column per (alpha, lag) pair.
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [2,5,10,20,30,50,70,91, 98, 105, 112, 180, 270, 365, 546, 728]

df = ewm_features(df, alphas, lags)

# Day of year
# A lag of k inside a (day_of_year, store, item) group is the same calendar day
# k occurrences earlier. "date" is an explicit sort key so the rows of each group
# are chronological regardless of how df was ordered before this step.
df.sort_values(["day_of_year", "store", "item", "date"], inplace = True)
df = lag_features(df, lags = [1,2,3,4],
                  groups = ["day_of_year", "store", "item"], target = 'sales', prefix = 'dayofyear_sales_lag_')


# pd.cut
# Bucket stores (4 equal-width bins) and items (5 equal-width bins) by their mean
# sales and attach the bucket label to every row as <key>_cluster.
for key, n_bins in (("store", 4), ("item", 5)):
    clus = df.groupby([key]).sales.mean().reset_index()
    clus[key + "_cluster"] = pd.cut(clus.sales, bins = n_bins, labels = range(1, n_bins + 1))
    # Without the mean-sales column, the only column shared with df is the key.
    df = pd.merge(df, clus.drop("sales", axis = 1), how = "left")
del clus

df.shape

# Train-Validation Split

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split



In [None]:
# Display the frame with all engineered features.
df

In [None]:
# Order all rows by date and renumber the index from 0.
df = df.sort_values("date").reset_index(drop = True)


In [None]:
# Rows dated up to and including 2014-12-26 form the modelling set.
# NOTE(review): the cutoff is hard-coded - presumably the last training date; confirm.
df1 = df.loc[(df["date"] <= "2014-12-26"), :]

In [None]:
#X = df1.loc[:,~(df1.columns.str.contains("sales")|(df1.columns.str.contains("id"))|(df1.columns.str.contains("date")))]

In [None]:
# Feature matrix: drop every column whose name contains "sales", "id" or "date".
# This is a substring match, so besides the target it also removes all engineered
# columns with "sales" in their name (sales_lag_*, sales_roll_mean_*,
# *months_sales_*, sales_ewm_*, *_sales_lag_*).
# NOTE(review): confirm that discarding those engineered features is intended.
X = df1.loc[:, ~df1.columns.str.contains("sales|id|date")]

In [None]:
# Target: sales for the same rows as X.
Y= df1['sales']

In [None]:
# Copy of X without any column whose name contains "date".
# NOTE(review): `val` is reassigned from X_val in the error-analysis section before
# it is read, so this assignment looks unused - confirm.
val= X.loc[:,~(X.columns.str.contains("date"))]

In [None]:
# Hold out the last 10% of rows as the validation set. The frame was sorted by date
# before X and Y were built, so shuffle=False gives a chronological split: the model
# is validated on dates later than the ones it was trained on, which is what a
# forecast has to do. It also keeps X_train in time order, as TimeSeriesSplit assumes.
# A shuffled split would mix future rows into training and flatter the validation score.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, shuffle=False)

In [None]:
# Display the validation features.
X_val 

In [None]:
# Shapes of the training and validation targets and features.
Y_train.shape, X_train.shape, Y_val.shape, X_val.shape

# Custom Cost Function

If you want to see the SMAPE formula, click [here](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).

In [None]:
# SMAPE: Symmetric mean absolute percentage error (adjusted MAPE)
def smape(preds, target):
    """Symmetric mean absolute percentage error, in percent (0-200).

    Positions where both `preds` and `target` are 0 contribute 0 to the sum,
    but they still count in the denominator, which is the full input length.
    The formula is symmetric, so the argument order does not change the result.
    """
    total = len(preds)
    both_zero = (preds == 0) & (target == 0)
    keep = ~both_zero
    kept_preds = preds[keep]
    kept_target = target[keep]
    ratios = np.abs(kept_preds - kept_target) / (np.abs(kept_preds) + np.abs(kept_target))
    return (200 * np.sum(ratios)) / total

def lgbm_smape(y_true, y_pred):
    """SMAPE as a LightGBM custom-metric tuple: (name, value, is_higher_better)."""
    # False marks the metric as one to be minimised.
    return 'SMAPE', smape(y_true, y_pred), False


# First Model

### Default Parameters

In [None]:
# Baseline: LightGBM regressor with default hyperparameters.
# NOTE(review): no eval_set is passed to fit, so the custom eval_metric is presumably
# never evaluated during training - confirm.
first_model = lgb.LGBMRegressor(random_state=384).fit(X_train, Y_train, 
                                                      eval_metric= lambda y_true, y_pred: [lgbm_smape(y_true, y_pred)])

# SMAPE on the training and validation sets.
print("TRAIN SMAPE:", smape(Y_train, first_model.predict(X_train)))
print("VALID SMAPE:", smape(Y_val, first_model.predict(X_val)))

# Feature Importance

In [None]:

def plot_lgb_importances(model, plot=False, num=10):
    """Rank the features of a fitted LightGBM sklearn-API model by gain.

    Parameters
    ----------
    model : fitted estimator exposing ``booster_`` and ``feature_name_``
        (for the native Booster API use ``feature_importance`` and
        ``feature_name()`` on the booster instead).
    plot : bool
        False (default): print the top `num` rows and return the full table.
        True: draw a bar chart of the top `num` features by gain and return None.
    num : int
        Number of top features to print or plot. The chart used to ignore this
        argument and always show 25 features.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``feature``, ``split`` and ``gain`` (gain as a percentage of the
        total), sorted by gain descending. None when ``plot`` is True.
    """
    gain = model.booster_.feature_importance(importance_type='gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name_,
                             'split': model.booster_.feature_importance(importance_type='split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    if not plot:
        print(feat_imp.head(num))
        return feat_imp

    # Imported only on the plotting path, so the table path does not need the plotting libraries.
    from matplotlib import pyplot as plt
    import seaborn as sns

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="gain", y="feature", data=feat_imp.head(num))
    plt.title('feature')
    plt.tight_layout()
    plt.show()
    return None

# Print the 20 most important features and keep the full importance table.
feature_imp_df = plot_lgb_importances(first_model, num=20)

In [None]:
# Table shape for: all features, features with non-zero gain, features with a gain share above 0.57 (percent).
feature_imp_df.shape, feature_imp_df[feature_imp_df.gain > 0].shape, feature_imp_df[feature_imp_df.gain > 0.57].shape

In [None]:
# Bar chart of feature importance by gain.
plot_lgb_importances(first_model, plot=True, num=20)

# Shap

In [None]:
# Build a SHAP explainer for the baseline model and compute SHAP values
# for the training and validation features.
explainer = shap.Explainer(first_model)
shap_values_train = explainer(X_train)
shap_values_valid = explainer(X_val)

# Number of explained rows in each set.
len(shap_values_train), len(shap_values_valid)

In [None]:
# Summarize the effects of the features on the training set (up to 30 shown)
shap.plots.beeswarm(shap_values_train, max_display=30)

In [None]:
# Summarize the effects of the features on the validation set (up to 30 shown)
shap.plots.beeswarm(shap_values_valid, max_display=30)

In [None]:
# Bar plot of the training-set SHAP values (up to 30 features shown)
shap.plots.bar(shap_values_train, max_display=30)

# Error Analysis

In [None]:
# Work on a copy so the validation features themselves stay untouched.
val = X_val.copy()

In [None]:
# Column dtypes of the validation copy.
val.dtypes

In [None]:
# X has no date column, so build a "<year>-<week_of_year>" string per row as a stand-in for the date.
date_proxy = pd.Series(val.year.apply(str) + '-' + val.week_of_year.apply(str))

In [None]:
import datetime


def _iso_week_monday(iso_year, iso_week):
    """Return the Monday (as a datetime) of the given ISO week of the given ISO year."""
    # ISO week 1 is the week that contains January 4th.
    jan4 = datetime.datetime(iso_year, 1, 4)
    week1_monday = jan4 - datetime.timedelta(days=jan4.weekday())
    return week1_monday + datetime.timedelta(weeks=iso_week - 1)


def convert_to_date(x):
    """Convert a "<year>-<week>" string into the Monday of that ISO week.

    The week number is read as an ISO 8601 week, matching the ``week_of_year``
    feature. It used to be parsed with ``%W``, which numbers weeks from the first
    Monday of the year and therefore lands one week late in most years.

    If the week does not exist in ``year`` (week 53 of a 52-week year), the row
    can only be an early-January day that still belongs to the last ISO week of
    the previous year, so that week is returned instead. Other year-boundary rows
    (week 1 in late December, week 52 in early January) cannot be told apart from
    the string alone and are resolved against ``year``.

    Returns a ``datetime.datetime`` at midnight.
    """
    year_text, week_text = x.split('-')
    year, week = int(year_text), int(week_text)
    monday = _iso_week_monday(year, week)
    if monday.isocalendar()[1] != week:
        monday = _iso_week_monday(year - 1, week)
    return monday

In [None]:
# Renumber rows from 0 so they line up positionally with the proxy dates added next.
val = val.reset_index(drop=True)

In [None]:
# Turn every "<year>-<week>" string into a date and attach it by position.
val['date_proxy'] = date_proxy.apply(convert_to_date).reset_index(drop=True)

In [None]:
# Renumber the validation features and target from 0 so they align with `val` by position.
X_val = X_val.reset_index(drop=True)
Y_val = Y_val.reset_index(drop=True)

In [None]:
# One row per validation sample: proxy date, store, item, actual sales and the baseline prediction.
error = pd.DataFrame({
    "date":val.date_proxy,
    "store":X_val.store,
    "item":X_val.item,
    "actual":Y_val,
    "pred":first_model.predict(X_val)
}).reset_index(drop = True)

# Absolute error of each prediction.
error["error"] = np.abs(error.actual-error.pred)

# The 20 worst predictions.
error.sort_values("error", ascending=False).head(20)

In [None]:
# Distribution of actuals, predictions and absolute errors, including upper percentiles.
error[["actual", "pred", "error"]].describe([0.7, 0.8, 0.9, 0.95, 0.99]).T

In [None]:
# Mean absolute error per (store, item) pair, worst first
error.groupby(["store", "item"]).error.mean().sort_values(ascending = False)

In [None]:
# Mean absolute error per store, worst first
error.groupby(["store"]).error.mean().sort_values(ascending = False)

In [None]:
# Mean absolute error per item, worst first
error.groupby(["item"]).error.mean().sort_values(ascending = False)

#### Store 1 Validation Set: Actual & Pred

In [None]:
# Store 1 Actual - Pred
# Sort by date so each line is drawn left to right; on an unsorted index the
# line plots zigzag back and forth.
sub = error[error.store == 1].set_index("date").sort_index()
# One panel per item (items 1-4) in a single row; a second row would stay empty.
fig, axes = plt.subplots(1, 4, figsize=(60, 15))
for item_id, ax in zip(range(1, 5), axes):
    item_rows = sub[sub.item == item_id]
    item_rows.actual.plot(ax=ax, legend=True, label="Item " + str(item_id) + " Sales")
    item_rows.pred.plot(ax=ax, legend=True, label="Item " + str(item_id) + " Pred", linestyle="dashed")
plt.tight_layout(pad=4.5)
plt.suptitle("Store 1 Item Satış Dağılımı");
plt.show()

In [None]:
# 4x2 grid of diagnostic plots for the validation errors, saved to errors.png.
fig, axes = plt.subplots(4, 2, figsize = (20,20))
for axi in axes.flat:
    axi.ticklabel_format(style="sci", axis="y", scilimits=(0,10))
    axi.ticklabel_format(style="sci", axis="x", scilimits=(0,10))
    # Format tick values as integers with thousands separators.
    axi.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))
    axi.get_xaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))
    
# Row 0: histogram of the signed error (actual - pred) and of the absolute error.
(error.actual-error.pred).hist(ax = axes[0, 0], color = "steelblue", bins = 20)
error.error.hist(ax = axes[0,1], color = "steelblue", bins = 20)
# Row 1: error scaled by the standard deviation of the signed error, against the predictions.
# NOTE(review): `error.error` is the absolute error, so these "standardized residuals"
# are never negative - confirm the sign was meant to be dropped.
sr = error.copy()
sr["StandardizedR"] = (sr.error / (sr.actual-sr.pred).std())
sr["StandardizedR2"] = ((sr.error / (sr.actual-sr.pred).std())**2)
sr.plot.scatter(x = "pred",y = "StandardizedR", color = "red", ax = axes[1,0])
sr.plot.scatter(x = "pred",y = "StandardizedR2", color = "red", ax = axes[1,1])
# Row 2: histograms of actual and predicted sales.
error.actual.hist(ax = axes[2, 0], color = "purple", bins = 20)
error.pred.hist(ax = axes[2, 1], color = "purple", bins = 20)
# Row 3: actual vs. predicted scatter and a QQ plot.
error.plot.scatter(x = "actual",y = "pred", color = "seagreen", ax = axes[3,0]);
# QQ Plot
# NOTE(review): the QQ plot is drawn for the predictions (sr.pred), not for the
# residuals - confirm. `pylab` is imported but not used in this cell.
import statsmodels.api as sm
import pylab
sm.qqplot(sr.pred, ax = axes[3,1], c = "seagreen")
# Titles and axis labels.
plt.suptitle("ERROR ANALYSIS", fontsize = 20)
axes[0,0].set_title("Error Histogram", fontsize = 15)
axes[0,1].set_title("Absolute Error Histogram", fontsize = 15)
axes[1,0].set_title("Standardized Residuals & Fitted Values", fontsize = 15)
axes[1,1].set_title("Standardized Residuals^2 & Fitted Values", fontsize = 15)
axes[2,0].set_title("Actual Histogram", fontsize = 15)
axes[2,1].set_title("Pred Histogram", fontsize = 15);
axes[3,0].set_title("Actual Pred Relationship", fontsize = 15);
axes[3,1].set_title("QQ Plot", fontsize = 15);
axes[1,0].set_xlabel("Fitted Values (Pred)", fontsize = 12)
axes[1,1].set_xlabel("Fitted Values (Pred)", fontsize = 12)
axes[3,0].set_xlabel("Actual", fontsize = 12)
axes[1,0].set_ylabel("Standardized Residuals", fontsize = 12)
axes[1,1].set_ylabel("Standardized Residuals^2", fontsize = 12)
axes[3,0].set_ylabel("Pred", fontsize = 12)
fig.tight_layout(pad=3.0)
plt.savefig("errors.png")
plt.show()

# Next Model

### Default Parameters & Feature Selection with LGBM Feature Importance

In [None]:
# First model feature importance
# Keep the features whose gain share in the first model exceeds 0.015 (percent).
cols = feature_imp_df[feature_imp_df.gain > 0.015].feature.tolist()
print("Independent Variables:", len(cols))

# Refit with default hyperparameters on the selected features only.
second_model = lgb.LGBMRegressor(random_state=384).fit(
    X_train[cols], Y_train, 
    eval_metric= lambda y_true, y_pred: [lgbm_smape(y_true, y_pred)])

print("TRAIN SMAPE:", smape(Y_train, second_model.predict(X_train[cols])))
print("VALID SMAPE:", smape(Y_val, second_model.predict(X_val[cols])))

**First Model Scores**

- TRAIN SMAPE: 7.681385434152381
- VALID SMAPE: 14.713719362343234

**Second Model Scores**
- TRAIN SMAPE: 7.7084
- VALID SMAPE: 14.531
- 164 Features

# Hyperparameter Tuning

There are two steps to tune LGBM models!

- **1st Optimization:** Finding the other parameters while the number of iterations is held constant (GridSearchCV, RandomizedSearchCV, etc.)
- **2nd Optimization:** Finding best iteration number by using early stopping round

**Hyperparameter tuning takes a long time because of the high number of iterations and the size of the data, which is why the Random Search CV code is also included below as text.**

In [None]:
###First Optimization: Hyperparameter Tuning with Random Searched
# Candidate values sampled by the random search.
lgbm_params = {

"num_leaves":[20,31], # Default 31
"max_depth":[-1, 20, 30], # Default -1
"learning_rate":[0.1, 0.05], # Default 0.1
"n_estimators":[10000,15000], # Default 100
"min_split_gain":[0.0, 2,5], # Default 0
"min_child_samples":[10, 20, 30], # Default 20
"colsample_bytree":[0.5, 0.8, 1.0], # Default 1
"reg_alpha":[0.0, 0.5, 1], # Default 0
"reg_lambda":[0.0, 0.5, 1] # Default 0
}

model = lgb.LGBMRegressor(random_state=384)

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer
# TimeSeriesSplit trains on earlier rows and validates on later ones, so it
# expects X_train to be in chronological order.
tscv = TimeSeriesSplit(n_splits=3)

# SMAPE is an error: lower is better. make_scorer defaults to
# greater_is_better=True, which would make the search pick the parameters with
# the HIGHEST SMAPE. With greater_is_better=False the score is negated, so the
# search maximises -SMAPE and best_params_ is the lowest-error candidate.
smape_scorer = make_scorer(smape, greater_is_better=False)

rsearch = RandomizedSearchCV(model, lgbm_params, random_state=384, cv=tscv,
                             scoring=smape_scorer, verbose = True, n_jobs = -1).fit( X_train[cols], Y_train )

print(rsearch.best_params_)


#### First Optimization: Hyperparameter Tuning with Random Searched
lgbm_params = {
    
    "num_leaves":[20,31], # Default 31
    "max_depth":[-1, 20, 30], # Default -1
    "learning_rate":[0.1, 0.05], # Default 0.1
    "n_estimators":[10000,15000], # Default 100
    "min_split_gain":[0.0, 2,5], # Default 0
    "min_child_samples":[10, 20, 30], # Default 20
    "colsample_bytree":[0.5, 0.8, 1.0], # Default 1
    "reg_alpha":[0.0, 0.5, 1], # Default 0
    "reg_lambda":[0.0, 0.5, 1] # Default 0
}

model = lgb.LGBMRegressor(random_state=384)

- from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
- from sklearn.metrics import make_scorer

tscv = TimeSeriesSplit(n_splits=3)

rsearch = RandomizedSearchCV(model, lgbm_params, random_state=384, 
                             cv=tscv, scoring=make_scorer(smape),
                             verbose = True, n_jobs = -1).fit(
    X_train[cols], Y_train
)

print(rsearch.best_params_)

{'num_leaves': 31, 'n_estimators': 15000, 'max_depth': 20}

**When Random Search CV finishes running, it gives us the best parameters found in the sampled parameter space. We should then train a new model with these parameters and evaluate its performance.**

model_tuned = lgb.LGBMRegressor(**rsearch.best_params_, random_state=384).fit(X_train[cols], Y_train)

print("TRAIN SMAPE:", smape(Y_train, model_tuned.predict(X_train[cols])))

print("VALID SMAPE:", smape(Y_val, model_tuned.predict(X_val[cols])))

- TRAIN SMAPE: 9.424761658139554
- VALID SMAPE: 12.784089823495902

In [None]:
# Refit on the selected features with the parameters chosen by the random search.
model_tuned = lgb.LGBMRegressor(**rsearch.best_params_, random_state=384).fit(X_train[cols], Y_train)

print("TRAIN SMAPE:", smape(Y_train, model_tuned.predict(X_train[cols])))

print("VALID SMAPE:", smape(Y_val, model_tuned.predict(X_val[cols])))



**First Model Scores**

- TRAIN SMAPE: 7.681385434152381
- VALID SMAPE: 14.713719362343234

**Second Model Scores**

- TRAIN SMAPE: 7.7084
- VALID SMAPE: 14.531
- 164 Features

**Third Model Scores**

- TRAIN SMAPE: 0.031440652293170154
- VALID SMAPE: 14.46012994194726

#### Second Optimization: Find best iteration number

In [None]:
# Best Params: {'num_leaves': 31, 'n_estimators': 15000, 'max_depth': 20}
# model_tuned2 = lgb.LGBMRegressor(**rsearch.best_params_, random_state=384, metric = "custom")

# NOTE(review): num_leaves=20 and max_depth=-1 below differ from the "Best Params"
# recorded in the comment above (31 and 20) - confirm which values are intended.
model_tuned2 = lgb.LGBMRegressor(num_leaves=20, n_estimators=15000, max_depth=-1, random_state=384, metric = "custom")
              
# Fit with a large iteration budget while evaluating the custom SMAPE on the train
# and validation sets; early_stopping_rounds=1000 and verbose=500 are passed to fit.
# NOTE(review): presumably training stops once the validation score has not improved
# for 1000 rounds and progress is logged every 500 - confirm against the installed LightGBM version.
model_tuned2.fit(
    X_train[cols], Y_train,
    eval_metric= lambda y_true, y_pred: [lgbm_smape(y_true, y_pred)],
    eval_set = [(X_train[cols], Y_train), (X_val[cols], Y_val)],
    eval_names = ["Train", "Valid"],
    early_stopping_rounds= 1000, verbose = 500
)
# Iteration count to reuse for the final model.
print("Best Iteration:", model_tuned2.booster_.best_iteration)

**First Model Scores**

- TRAIN SMAPE: 7.681385434152381
- VALID SMAPE: 14.713719362343234

**Second Model Scores**

- TRAIN SMAPE: 7.7084
- VALID SMAPE: 14.531
- 164 Features

**Third Model Scores**

- TRAIN SMAPE: 0.031440652293170154
- VALID SMAPE: 14.46012994194726

Train's SMAPE: 10.4091	Valid's SMAPE: 14.369




# Final Model

In [None]:
# Restore per-series chronological order before splitting.
df.sort_values(["store", "item", "date"], inplace = True)

# Rows up to 2014-12-26 are used for training; later rows are the ones to forecast.
train_final = df.loc[(df["date"] <= "2014-12-26"), :]
test_final = df.loc[(df["date"] > "2014-12-26"), :]

# Use the same selected feature columns as the tuned models.
X_train_final = train_final[cols]
Y_train_final = train_final.sales
X_test_final = test_final[cols]


#final_model = lgb.LGBMRegressor(**rsearch.best_params_, random_state=384, metric = "custom") # Tuned parameters
# Best Params: {'num_leaves': 31, 'n_estimators': 15000, 'max_depth': 20}
final_model = lgb.LGBMRegressor(num_leaves=20, n_estimators=15000, max_depth=-1, random_state=384, metric = "custom")
# Replace the iteration budget with the best iteration found by early stopping.
final_model.set_params(n_estimators=model_tuned2.booster_.best_iteration) # Best Iteration: 983
# Train on the whole training window (no validation hold-out here).
final_model.fit(X_train_final[cols], Y_train_final,
                eval_metric= lambda y_true, y_pred: [lgbm_smape(y_true, y_pred)])

# Submission

In [None]:
# Submission frame: the test row id (as int) and the predicted sales.
submission = pd.DataFrame({
    "id":test_final.id.astype(int),
    "sales":final_model.predict(X_test_final)
})
# index = None: the frame index is not written to the file.
submission.to_csv("submission1.csv", index = None)

In [None]:
# Distribution of the predicted sales, including selected percentiles.
submission[["sales"]].describe([0.1, 0.75, 0.8, 0.9, 0.95, 0.99]).T

In [None]:
# Histogram of the predicted sales.
submission.sales.hist(color = "g");

In [None]:
# Forecast table: date, store and item of every test row with its predicted sales.
forecast = pd.DataFrame({
    "date":test_final.date,
    "store":test_final.store,
    "item":test_final.item,
    "sales":final_model.predict(X_test_final)
})

# Forecast for store 2 / item 2. The legend label now names the series that is
# actually selected (it used to say "Store 1 Item 1").
forecast[(forecast.store == 2) & (forecast.item == 2)].set_index("date").sales.plot(color = "orange", figsize = (20,9),legend=True, label = "Store 2 Item 2 Forecast");

In [None]:
# Historical sales and forecast for store 3 / item 3.
train_final[(train_final.store == 3) & (train_final.item == 3)].set_index("date").sales.plot(figsize = (40,9),legend=True, label = "Store 3 Item 3 Sales")
forecast[(forecast.store == 3) & (forecast.item == 3 )].set_index("date").sales.plot(legend=True, label = "Store 3 Item 3 Forecast");

In [None]:
# Actual sales and forecast for items 1-4 of one store, one panel per item.
store = 3
sub = train[train.store == store].set_index("date")
forc = forecast[forecast.store == store].set_index("date")


# Four panels in a single row; a second row would stay empty.
fig, axes = plt.subplots(1, 4, figsize=(60, 15))
for item_id, ax in zip(range(1, 5), axes):
    sub[sub.item == item_id].sales.plot(ax=ax, legend=True, label="Item " + str(item_id) + " Sales")
    forc[forc.item == item_id].sales.plot(ax=ax, legend=True, label="Forecast")
plt.tight_layout(pad=6.5)
# Build the title from `store`: it used to say "Store 1" while store 3 was plotted.
plt.suptitle("Store " + str(store) + " Items Actual & Forecast");