# Starting Notebook for Time Series Forecasting

The time series in this competition are similar to those of the M5 competition (https://www.kaggle.com/c/m5-forecasting-accuracy), held on Kaggle a few months ago. This notebook introduces a relatively simple baseline solution that uses a deterministic process along with LightGBM (with Optuna for hyperparameter tuning).

Note that this notebook uses only a deterministic process to infer the amount of sales. This means that we ignore oil prices, promotions and holidays (except for New Year's Day). Additionally, we train a multi-output regressor that fits each store/product series independently, so we neglect the interactions between the various products in the same store. Again, this is meant to be a baseline solution; for more accurate predictions I suggest following the solutions at https://github.com/Mcompetitions/M5-methods, which fully exploit the properties of this kind of dataset.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import optuna
from pickle import dump, load
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

## Load the train and test datasets

We will only use the train and test datasets. Since this is a baseline solution, the deterministic process is all we need to make predictions.

In [None]:
# Directory holding the competition CSV files (note the trailing slash).
path = '/kaggle/input/store-sales-time-series-forecasting/'

# Only the columns needed for the baseline are read; the identifiers are
# loaded as categoricals and the target as float32.
train = pd.read_csv(
    f'{path}train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    # `infer_datetime_format` is intentionally not passed: it is deprecated
    # since pandas 2.0, where strict format inference became the default.
    parse_dates=['date'],
)

# Represent dates as daily periods and index each row by (store, family, day).
train['date'] = train.date.dt.to_period('D')
train = train.set_index(['store_nbr', 'family', 'date']).sort_index()

# Wide target frame: one row per day of 2017, one column per
# (store_nbr, family) series.
y = train.unstack(['store_nbr', 'family']).loc["2017"]
y.head()

In [None]:
# Load the test set. os.path.join copes with `path` with or without a trailing
# separator, avoiding the doubled slash produced by f'{path}/test.csv'.
test = pd.read_csv(
    os.path.join(path, 'test.csv'),
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    # `infer_datetime_format` is intentionally not passed: it is deprecated
    # since pandas 2.0, where strict format inference became the default.
    parse_dates=['date'],
)

# Same layout as the training frame: daily periods, indexed by
# (store_nbr, family, date).
test['date'] = test.date.dt.to_period('D')
test = test.set_index(['store_nbr', 'family', 'date']).sort_index()

test.head()

In [None]:
# Build the deterministic design matrix for the training window.
# Calendar Fourier terms (freq 'M', order 4) are added on top of the linear
# trend (order=1) and the seasonal terms; drop=True asks statsmodels to
# remove perfectly collinear columns.
fourier = CalendarFourier(order=4, freq='M')
dp = DeterministicProcess(
    y.index,
    additional_terms=[fourier],
    seasonal=True,
    order=1,
    drop=True,
)

# In-sample features, plus a flag marking January 1st.
X = dp.in_sample()
is_first_day_of_year = X.index.dayofyear == 1
X['NewYear'] = is_first_day_of_year
X.head()

In [None]:
def _objective(trial):
    """Score one Optuna trial with time-ordered cross-validation.

    Samples ``n_estimators`` for a random forest wrapped in a
    ``MultiOutputRegressor``, fits it on each of three ``TimeSeriesSplit``
    folds of the module-level ``X``/``y`` (16 validation rows per fold) and
    averages the per-fold mean squared log error.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to report intermediate
        scores to the pruner.

    Returns
    -------
    float
        Mean of the per-fold mean squared log errors (lower is better).

    Raises
    ------
    optuna.TrialPruned
        If the pruner decides to stop the trial after an intermediate report.
    """
    params = {
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 50, 300, log=True),
    }

    model = MultiOutputRegressor(RandomForestRegressor(**params), n_jobs=-1)

    n_splits = 3
    cv_metrics = np.zeros(n_splits)
    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=16)
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        y_pred[y_pred < 0] = 0  # The log-based metric rejects negative values

        cv_metrics[i] = mean_squared_log_error(y_valid, y_pred)

        # Report the running mean over the folds evaluated so far so that the
        # pruner can stop unpromising trials early.
        trial.report(np.mean(cv_metrics[:i + 1]), i)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(cv_metrics)


def optimize_model(n_trials=10):
    """Tune ``n_estimators`` with Optuna, then fit and save the final model.

    The search objective lives in ``_objective``; it previously shared this
    function's name, so this definition shadowed it and the study was handed
    the tuner itself instead of the objective.

    Parameters
    ----------
    n_trials : int, default 10
        Number of Optuna trials to run.

    Returns
    -------
    MultiOutputRegressor
        Random-forest model fitted on the full ``X``/``y`` with the best
        parameters found; also pickled to ``models/model.pkl``.
    """
    # Neither SQLite nor open() creates missing parent directories.
    os.makedirs('results', exist_ok=True)
    os.makedirs('models', exist_ok=True)

    # Define Optuna sampler and pruner
    sampler = optuna.samplers.TPESampler()
    pruner = optuna.pruners.MedianPruner(n_startup_trials=1,
                                         n_warmup_steps=0,
                                         interval_steps=1)

    # The study is persisted in SQLite and reloaded if it already exists, so
    # repeated calls continue the same search.
    study = optuna.create_study(sampler=sampler, pruner=pruner,
                                storage='sqlite:///results/lightgbm_hyperparams.db',
                                load_if_exists=True, direction="minimize",
                                study_name='Time_series_forecasting')

    study.optimize(_objective, n_trials=n_trials)

    params = {
        'random_state': 42,
    }
    params.update(study.best_params)

    # Refit with the same estimator family that was tuned (the previous
    # LGBMRegressor reference was never imported).
    model = MultiOutputRegressor(RandomForestRegressor(**params), n_jobs=-1)
    model.fit(X, y)

    # Context manager guarantees the pickle file is flushed and closed.
    with open('models/model.pkl', 'wb') as model_file:
        dump(model, model_file)

    return model

In [None]:
# To re-run the hyperparameter search instead of using the stored values:
# model = optimize_model(n_trials=10)

# Hyperparameters found by the Optuna optimization process.
params = {
    'n_estimators': 112,
    'random_state': 42,
}
base_regressor = RandomForestRegressor(**params)
model = MultiOutputRegressor(base_regressor, n_jobs=-1)
model.fit(X, y)

In [None]:
# Extend the deterministic features 16 steps past the end of the training index.
X_test = dp.out_of_sample(steps=16)
X_test.index.name = 'date'
X_test['NewYear'] = X_test.index.dayofyear == 1

# Forecast every series and floor the predictions at zero (fix undershooting).
y_pred = np.maximum(model.predict(X_test), 0)

# Back to long format, attach the submission ids from the test frame and
# write only the `id` and `sales` columns.
wide_forecast = pd.DataFrame(y_pred, index=X_test.index, columns=y.columns)
long_forecast = wide_forecast.stack(['store_nbr', 'family'])
y_submit = long_forecast.join(test.id).reindex(columns=['id', 'sales'])
y_submit.to_csv('./submission.csv', index=False)