In [0]:
%run "./Imports"

In [0]:
%run "./Model_Tuning"

In [0]:
%run "./General_Functions"

### Nerdearla 2021 - E2E Unified Demand Planning: Orders Forecasting

This notebook contains the code of the Orders Forecasting Pipeline, which is the process used to tune (train +
validation) and back-test the regression models for each product (SKU).

In order to analyze the behavior of the supply chain in an end-to-end fashion, the regression models receive
as input not only information from orders but also from demand.

The result of this process consists of logging the best model of every product into the MLflow tracking API and
generating the forecast for the back-testing period with that model.

The functions included are:

| Function | Description |
| -------- | ----------- |
| `obtain_models` | Obtains the best regression model per product |

###### Initializing variables

In [0]:
# --- Experiment configuration ---
# `holidays` and `num_evals` are forwarded to the tuning / re-fitting helpers by `obtain_models`
algorithms = ["lightgbm"]
holidays = False
num_evals = 2

# Validation window (start, end)
start_val, end_val = "2019-01-01", "2019-06-30"

# Back-testing window (start, end)
start_test, end_test = "2019-07-01", "2019-12-31"

###### Defining search space of each algorithm

In [0]:
# Hyperparameter search space explored when tuning LightGBM:
# integer ranges for the tree-structure parameters, discrete grids for the rest
params_lightgbm = dict(
    n_estimators=hp.randint("n_estimators", 15, 200),
    max_depth=hp.randint("max_depth", 3, 50),
    learning_rate=hp.choice("learning_rate", [0.01, 0.05, 0.1, 0.2]),
    reg_alpha=hp.choice("reg_alpha", [0.2, 1, 5, 10]),
    reg_lambda=hp.choice("reg_lambda", [0.2, 1, 5, 10]),
)

###### Setting Mlflow experiment

In [0]:
# Workspace path of the Mlflow experiment that collects the runs of this notebook
mlflow_exp = r"/Users/n.garcia.aramouni@accenture.com/UDP_E2E_Forecasting/nerdearla_udp_orders"

# Client used to look the experiment up by name
client = MlflowClient()

# Get-or-create: reuse the id of an existing experiment, otherwise create it
experiment = client.get_experiment_by_name(mlflow_exp)
exp_id = mlflow.create_experiment(mlflow_exp) if experiment is None else experiment.experiment_id

###### Defining modeling function

In [0]:
def obtain_models(data, df_frds):
    """
    Tunes, back-tests and logs the orders forecasting model (LightGBM) of a single product (SKU).

    The series is sorted by date, enriched with orders lags and split into a train/validation part and a
    back-testing part. Hyperparameter tuning is done with `tune_ts_model` on the columns that hold actual
    consumption; the model is then re-fitted and the back-testing forecast generated with
    `refit_generate_forecast` on the "_mod" columns, which hold consumption forecasts for the forecasting period.

    Train WAPE, validation WAPE and test WAPE are logged into an Mlflow run named after the SKU.

    The function reads the notebook-level variables `start_test`, `end_test`, `start_val`, `end_val`,
    `params_lightgbm`, `num_evals`, `holidays` and `exp_id`.

    Parameters
    __________
        data (pd.DataFrame): Dataset with the time series of the product.
        df_frds (pd.DataFrame): Dataset with holidays. It is only forwarded to the tuning step when the
            notebook-level `holidays` flag is enabled.

    Returns
    _______
        df_forecasts (pd.DataFrame): Table with the forecasts of the best model for the back-testing set,
            plus the identification column "n_sku".
    """
    # Ensuring order of observations
    data = data.sort_values(by="ds", ascending=True).reset_index(drop=True)

    # Obtaining product info (positional access, so it does not depend on the index labels)
    sku = data["n_sku"].iloc[0]

    # Generate orders lags
    data = create_orders_lags(data)

    # Splitting the series
    df_trainval, df_test = split_series(data, start_test, end_test)

    # Define algorithm
    algorithm = "lightgbm"

    # We define the columns that we use for tuning, which hold actuals for consumption
    # These columns will be
    # - "ds" -> date
    # - "y_lag_i" -> Orders lag i
    # - "lag_i" -> Consumption lag i
    # - "lead_i" -> Consumption lead i
    # - "y_dmd" -> Consumption for period equal to ds
    # - "y" -> Orders for period ds. This is the target column
    tuning_cols = ["ds",
                   "y_lag_1", "y_lag_2", "y_lag_3", "y_lag_4", "y_lag_5", "y_lag_6", "y_lag_7",
                   "lag_1", "lag_2", "lag_3", "lag_4", "lag_5", "lag_6", "lag_7",
                   "lead_1", "lead_2", "lead_3", "lead_4", "lead_5", "lead_6", "lead_7",
                   "y_dmd",
                   "y"]

    # Tuning the model
    # The holidays dataset is forwarded only when holidays are enabled, so tuning and re-fitting see the same
    # inputs; with holidays disabled the tuning step receives None, exactly as before
    results = tune_ts_model(
        algorithm,
        params_lightgbm,
        num_evals,
        df_trainval[tuning_cols],
        start_val,
        end_val,
        holidays=holidays,
        df_frds=df_frds if holidays else None
    )

    # We change the list of columns that will be used to predict the testing period
    # The difference lies in the fact that for the testing period, we will use forecast information for consumption
    # The columns that have consumption forecasts in the forecasting period (and actuals during the training period)
    # have the suffix "_mod"
    fcsting_cols = ["ds",
                    "y_lag_1", "y_lag_2", "y_lag_3", "y_lag_4", "y_lag_5", "y_lag_6", "y_lag_7",
                    "lag_1_mod", "lag_2_mod", "lag_3_mod", "lag_4_mod", "lag_5_mod", "lag_6_mod", "lag_7_mod",
                    "lead_1_mod", "lead_2_mod", "lead_3_mod", "lead_4_mod", "lead_5_mod", "lead_6_mod", "lead_7_mod",
                    "y_dmd_mod",
                    "y"]

    # Re-fitting model and generating forecast
    df_fcst_alg, test_wape = refit_generate_forecast(
        algorithm, results["params"], df_trainval[fcsting_cols], df_test[fcsting_cols], holidays, df_frds
    )

    # Building the output object from a copy of the forecast
    # (DataFrame.append no longer exists in pandas >= 2.0, and the copy keeps the helper's frame untouched)
    df_forecasts = df_fcst_alg.copy()

    # Adding identification column
    df_forecasts["n_sku"] = sku

    # Starting run and assigning tags; the context manager ends the run even if logging fails
    with mlflow.start_run(experiment_id=exp_id, run_name=str(sku)):
        mlflow.set_tags(
            {
                "experiment": "Nerdearla 2021",
                "product": sku,
                "algorithm": algorithm
            }
        )

        # Logging results in Mlflow
        mlflow.log_metrics(
            {"train_wape": results["train_wape"], "val_wape": results["val_wape"], "test_wape": test_wape}
        )

    return df_forecasts

##### Orders forecasting pipeline main code

###### 1. Loading the data from DBFS

In [0]:
# Orders actuals: CSV stored in DBFS, with "ds" cast to a date
df_ord = spark.read.options(sep=",", header=True, inferSchema=True) \
    .csv(r"/FileStore/tables/raw_orders_data_v1_clean.csv")
df_ord = df_ord.withColumn("ds", df_ord["ds"].cast(DateType()))

# Demand (consumption) actuals: same treatment, and the target "y" is renamed to "y_dmd"
df_dmd = spark.read.options(sep=",", header=True, inferSchema=True) \
    .csv(r"/FileStore/tables/raw_consumption_data_clean.csv")
df_dmd = df_dmd.withColumn("ds", df_dmd["ds"].cast(DateType())).withColumnRenamed("y", "y_dmd")

# Demand forecasts: Delta table in DBFS, keeping only the key columns and the forecast (renamed to "y_dmd")
df_dmd_fcst = (
    spark.read.format("delta")
    .load(r"dbfs:/FileStore/results/demand_forecasts")
    .select("n_sku", "ds", "fcst")
    .withColumnRenamed("fcst", "y_dmd")
)

###### 2. Preparing dataset

In [0]:
# Creating filtered copy of demand actuals (only the observations before the back-testing period)
df_dmd_fltr = df_dmd.filter(df_dmd["ds"] < pd.to_datetime(start_test))

# Concatenating filtered demand actuals with forecasts
# unionByName matches the columns by name; a plain union matches them by position and would silently
# mix columns up if the column order of the actuals differed from the one selected for the forecasts
df_dmd_actfcs = df_dmd_fltr.unionByName(df_dmd_fcst)

# Creating leads and lags, both for the actuals and for the actuals + forecasts series ("_mod" suffix)
df_dmd_shift = shift_time_series(df_dmd, "y_dmd", lag_lead=7, suf="")
df_dmd_actfcs_shift = shift_time_series(df_dmd_actfcs, "y_dmd", lag_lead=7, suf="_mod") \
    .withColumnRenamed("y_dmd", "y_dmd_mod")

# Joining both DFs to create a single one
df_dmd_prp = df_dmd_shift.join(df_dmd_actfcs_shift, ["n_sku", "ds"], how="left")

# Adding demand features to orders data
# NOTE: this reassigns df_ord, so re-running this cell without reloading the data joins the features twice
df_ord = df_ord.join(df_dmd_prp, ["n_sku", "ds"], how="left")

In [0]:
# Previewing the orders dataset
display(df_ord)

n_sku,ds,y,y_dmd,lag_1,lead_1,lag_2,lead_2,lag_3,lead_3,lag_4,lead_4,lag_5,lead_5,lag_6,lead_6,lag_7,lead_7,y_dmd_mod,lag_1_mod,lead_1_mod,lag_2_mod,lead_2_mod,lag_3_mod,lead_3_mod,lag_4_mod,lead_4_mod,lag_5_mod,lead_5_mod,lag_6_mod,lead_6_mod,lag_7_mod,lead_7_mod
1,2016-02-14,7388.16,7104.0,8129.0,9386.0,9570.0,9241.0,9842.0,9010.0,9941.0,9282.0,8324.5,9326.0,8456.0,8102.0,4629.0,6443.0,7104.0,8129.0,9386.0,9570.0,9241.0,9842.0,9010.0,9941.0,9282.0,8324.5,9326.0,8456.0,8102.0,4629.0,6443.0
1,2016-03-06,6772.5,6450.0,8667.0,9212.0,9675.0,9302.0,9844.0,9064.0,9468.0,9575.0,9186.0,10074.0,9130.0,8613.0,6831.0,6673.0,6450.0,8667.0,9212.0,9675.0,9302.0,9844.0,9064.0,9468.0,9575.0,9186.0,10074.0,9130.0,8613.0,6831.0,6673.0
1,2016-06-06,9061.47,9153.0,6408.0,9690.0,8373.0,9840.0,10975.0,10144.0,9379.0,10561.0,9558.0,8656.0,8927.0,6166.0,8861.0,9106.0,9153.0,6408.0,9690.0,8373.0,9840.0,10975.0,10144.0,9379.0,10561.0,9558.0,8656.0,8927.0,6166.0,8861.0,9106.0
5,2016-07-09,3775.8,2829.0,3596.0,1611.0,3566.0,2536.0,3156.0,2850.0,2798.0,3032.0,973.0,3011.0,1719.0,3245.0,2878.0,3297.0,2829.0,3596.0,1611.0,3566.0,2536.0,3156.0,2850.0,2798.0,3032.0,973.0,3011.0,1719.0,3245.0,2878.0,3297.0
4,2016-07-31,3464.92,3364.0,4648.0,8236.0,8935.0,8479.0,8953.0,8468.0,8429.0,8897.0,7736.0,8968.0,7967.0,4360.0,3888.0,3984.0,3364.0,4648.0,8236.0,8935.0,8479.0,8953.0,8468.0,8429.0,8897.0,7736.0,8968.0,7967.0,4360.0,3888.0,3984.0
5,2016-11-04,2674.35,3862.0,2547.0,3913.0,2421.0,2102.0,2180.0,2089.0,2052.0,2473.0,2105.0,2530.0,3216.0,2684.0,2968.0,3060.0,3862.0,2547.0,3913.0,2421.0,2102.0,2180.0,2089.0,2052.0,2473.0,2105.0,2530.0,3216.0,2684.0,2968.0,3060.0
4,2017-01-14,4296.32,4384.0,8910.0,3671.0,8632.0,8555.0,8489.0,8347.0,8337.0,8329.0,8335.0,8904.0,3354.0,8771.0,4453.0,4158.0,4384.0,8910.0,3671.0,8632.0,8555.0,8489.0,8347.0,8337.0,8329.0,8335.0,8904.0,3354.0,8771.0,4453.0,4158.0
4,2017-02-19,3243.0,3450.0,3989.0,7860.0,7712.0,8526.0,8784.0,8335.0,8671.0,8718.0,8801.0,8553.0,8457.0,4771.0,3398.0,2876.0,3450.0,3989.0,7860.0,7712.0,8526.0,8784.0,8335.0,8671.0,8718.0,8801.0,8553.0,8457.0,4771.0,3398.0,2876.0
2,2017-05-17,68028.87,76033.0,74757.0,76975.0,72006.0,78315.0,50578.0,60456.0,61016.0,46150.0,79399.0,72326.0,77827.0,75799.0,79367.0,81457.0,76033.0,74757.0,76975.0,72006.0,78315.0,50578.0,60456.0,61016.0,46150.0,79399.0,72326.0,77827.0,75799.0,79367.0,81457.0
1,2017-07-14,9089.6,9880.0,9233.0,8700.0,9953.0,6385.0,10479.0,9524.0,9416.0,10282.0,6089.0,9743.0,9416.0,10485.0,9692.0,10667.0,9880.0,9233.0,8700.0,9953.0,6385.0,10479.0,9524.0,9416.0,10282.0,6089.0,9743.0,9416.0,10485.0,9692.0,10667.0


###### 3. Performing modeling of SKUs

In [0]:
# Defining schema of the resulting dataframe
result_schema = StructType(
    [
     StructField("algorithm", StringType(), False),
     StructField("ds", DateType(), False),
     StructField("fcst", FloatType(), False),
     StructField("n_sku", IntegerType(), False)
    ]
)

# Performing modeling of the SKUs: obtain_models is applied to the pandas DataFrame of each product
# No holidays dataset is loaded in this notebook, so None is passed as `df_frds`
# (the boolean `holidays` flag is not a holidays dataset and must not be passed in its place)
# The result is persisted so that later actions on it can reuse the computed forecasts
df_fcsts = df_ord.groupBy("n_sku") \
    .applyInPandas(
        lambda df: obtain_models(df, None),
        result_schema
    ) \
    .persist(StorageLevel.MEMORY_ONLY)

# Adding identification key of experiments
df_fcsts = df_fcsts.withColumn("exp_key", concat(df_fcsts["n_sku"], lit("_"), df_fcsts["algorithm"]))
display(df_fcsts)

algorithm,ds,fcst,n_sku,exp_key
lightgbm,2019-07-01,8971.403,1,1_lightgbm
lightgbm,2019-07-02,9584.382,1,1_lightgbm
lightgbm,2019-07-03,9732.199,1,1_lightgbm
lightgbm,2019-07-04,9847.086,1,1_lightgbm
lightgbm,2019-07-05,9778.062,1,1_lightgbm
lightgbm,2019-07-06,7856.1504,1,1_lightgbm
lightgbm,2019-07-07,6203.0586,1,1_lightgbm
lightgbm,2019-07-08,9209.256,1,1_lightgbm
lightgbm,2019-07-09,9597.343,1,1_lightgbm
lightgbm,2019-07-10,9723.968,1,1_lightgbm


###### 4. Saving results on Delta

In [0]:
# Persisting the forecasts as a Delta table, replacing any previous data and schema
(
    df_fcsts.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("dbfs:/FileStore/results/orders_forecasts")
)