In [20]:
%run Forecasting_Models.ipynb
%run General_Functions.ipynb
%run Performance_Metrics.ipynb
%run Model_Tuning.ipynb

In [21]:
import pandas as pd
import numpy as np
from hyperopt import hp, tpe, Trials, fmin, STATUS_OK
from functools import partial
from pyspark.sql import *
from pyspark.conf import SparkConf
import mlflow

In [22]:
spark = SparkSession.builder \
        .master("local[4]") \
        .config("spark.submit.deployMode", "client") \
        .appName("UDP Nerdearla") \
        .getOrCreate()

### Nerdearla 2021 - E2E Unified Demand Planning: Demand Forecasting

This notebook contains the code of the Demand Forecasting Pipeline, which is the process used to tune (train +
validation) and back-test the models for each product (SKU) according to the different defined experiments. In this
context, an experiment is a scenario composed of:

  * Algorithm, such as Prophet, SARIMAX, XGBoost.
  * Set of input features (excluded from the scope of the workshop)

For each product this pipeline will train, tune, yield a best model (best set of hyperparameters) and backtest it for
each of the different experiments; which means that every product will have as many "best models" as the number of
experiments. The decision about which of these models to use as the final model is done by selecting the better on in
terms of the validation WAPE.

The result of this process consists of logging for all products the best model of each experiment into the Mlflow
tracking API and generating the forecast for the back-testing period with each one of these, after that only the
forecast corresponding to the best model among all the experiments is kept.

The functions included are:

| Function | Description |
| -------- | ----------- |
| `obtain_models` | defines and trains a SARIMAX model using the given hyperparameters and input time series |
| `obtain_prophet`  | defines and trains a Prophet model using the given hyperparameters and input time series |


###### Initializing variables

In [23]:
# Experiment variables
algorithms = ["prophet", "sarimax"]
holidays = False
num_evals = 20

# Dates for validation
start_val = "1956-04-01"
end_val = "1958-08-01"

# Dates for testing
start_test = "1958-09-01"
end_test = "1960-12-01"

###### Defining search space of each algorithm

In [24]:
# Defining search space for prophet
params_prophet = {
    "changepoint_prior_scale":  hp.loguniform("changepoint_prior_scale", np.log(0.001), np.log(0.5)),
    "seasonality_prior_scale": hp.loguniform("seasonality_prior_scale", np.log(0.01), np.log(10)),
    "holidays_prior_scale": hp.loguniform("holidays_prior_scale", np.log(0.01), np.log(10)),
}

# Defining search space for sarimax
params_sarimax = {
    "p": hp.choice("p", [0, 1, 2]),
    "d": hp.choice("d", [0, 1]),
    "q": hp.choice("q", [0, 1, 2]),
    "P": hp.choice("P", [0, 1, 2]),
    "D": hp.choice("D", [0, 1]),
    "Q": hp.choice("Q", [0, 1, 2]),
    "s": 12
}

###### Defining modeling function

In [25]:
def obtain_models(data):
    """

    Parameters
    __________
        df_train (pd.DataFrame): Dataset with training time series.
        params (dict): Dictionary with seasonal and non-seasonal order parameters of the model.
        holidays (bool, defaults to False): Flag to indicate whether the dataset contains the holidays regressor or not.

    Returns
    _______
        model (ARIMAResultsWrapper): Object with the SARIMAX model defined by "params" and trained with the given time
            series.
    """
    # Ensuring order of observations
    data = data.sort_values(by="ds", ascending=True).reset_index(drop=True)

    # Splitting the series
    df_trainval, df_test = split_series(data, start_test, end_test)

    # Looping over the algorithms
    for algorithm in algorithms:
        # Validating the algorithm to use
        if algorithm == "sarimax":
            search_space = params_sarimax
        elif algorithm == "prophet":
            search_space = params_prophet

        # Tuning the model for each algorithm
        results = tune_ts_model(
            algorithm, search_space, num_evals, df_trainval, start_val, end_val, holidays=holidays, df_frds=None
        )

        print(results)

    # Defining and training SARIMAX model
    model = "RTM"

    return model

##### Training pipeline main code

###### 1. Loading the preprocessed data from Delta table

In [26]:
df = pd.read_csv(r"..\Data\AirPassengers.txt", sep=',', index_col=None, header=0)
df["ds"] = pd.to_datetime(df["ds"])

abc, bcd = obtain_models(df)
#df_data = spark.read.csv(r"..\Data\AirPassengers.txt", sep=',', header=True, inferSchema=True)

INFO:numexpr.utils:NumExpr defaulting to 4 threads.
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  .astype(np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  .astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  .astype(np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  .astype(np.float)
INFO:hyperopt.tpe:build_posterior_wrapper took 0.002001 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss 7.541739
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
Deprecated in NumPy 1.20; for more details and 

{'train_wape': 3.998312284349654, 'val_wape': 7.462596926474789, 'params': {'changepoint_prior_scale': 0.09487458948953541, 'holidays_prior_scale': 5.731390816781686, 'seasonality_prior_scale': 0.24223971464655175}}


INFO:hyperopt.tpe:build_posterior_wrapper took 0.004000 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss 4.640643
INFO:hyperopt.tpe:build_posterior_wrapper took 0.003990 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss 4.640643
INFO:hyperopt.tpe:build_posterior_wrapper took 0.009001 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss 4.640643
INFO:hyperopt.tpe:build_posterior_wrapper took 0.002998 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss 4.640643
  params_variance = (residuals[k_params_ma:] ** 2).mean()
  ret = ret.dtype.type(ret / rcount)
INFO:hyperopt.tpe:build_posterior_wrapper took 0.003002 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss 4.640643
INFO:hyperopt.tpe:build_posterior_wrapper took 0.006000 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss 4.604391
INFO:hyperopt.tpe:build_posterior_wrapper took 0.004998 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss 4.589706
INFO:hyperopt.tp

{'train_wape': 4.955516381036997, 'val_wape': 3.9866262587260684, 'params': {'D': 0, 'P': 1, 'Q': 1, 'd': 0, 'p': 2, 'q': 0, 's': 12}}


ValueError: too many values to unpack (expected 2)