## Timeseries Modeling - Prophet
### Objective:
The objective of the notebook is to -
* Backtest on all hyperparameters of Prophet provided in the config
* Find the best set of hyperparameters using the metric provided in the config

In [0]:
import copy
import glob
import inspect
import logging
import os
import re
from datetime import datetime
from datetime import timedelta
from decimal import Decimal
from distutils.command.config import config
from typing import Callable

import dotsi
import mlflow
import numpy as np
import pandas as pd
import yaml
from prophet import Prophet
from prophet.make_holidays import make_holidays_df
from sklearn.metrics import mean_absolute_error,mean_squared_error
from tqdm.auto import tqdm

In [0]:
# Logging setup: every run writes to its own timestamped file in the temp directory
p_dir = '/tmp/'
log_file = "Prophet_hyperparameter_tuning ({}).log".format(datetime.today().strftime('%Y-%m-%d-%H-%M-%S'))

# Logger used throughout the notebook; records everything from DEBUG upwards
logger = logging.getLogger('custom_log')
logger.setLevel(logging.DEBUG)

# File handler with a "time - logger - level - message" line layout
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
fh = logging.FileHandler(p_dir+log_file)
fh.setFormatter(formatter)
logger.addHandler(fh)

In [0]:
# Getting the default settings of hyperparameters. Used to check that user-provided hyperparameters must always be a subset of these.
def get_default_args(func) -> dict:
    """Collect the default value of every argument accepted by ``func``.

    Parameters
    ----------
    func : callable
        The callable to inspect, e.g. the constructor of an algorithm (Prophet, SARIMAX)

    Returns
    -------
    dict
        Maps each argument name (``self`` excluded) to its default value;
        arguments declared without a default are mapped to ``None``
    """
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        if name == 'self':
            continue
        has_default = param.default is not inspect.Parameter.empty
        defaults[name] = param.default if has_default else None
    return defaults

#### Configurable Hyperparameters
The following are the possible hyperparameters that can be tuned for Prophet. The preferred hyperparameters, and their respective search spaces, over which tuning is to be done need to be mentioned in the config file.

In [0]:
# Default values for the hyperparameters in Prophet:
# maps every argument of the Prophet constructor to its default value
# (None when the argument has no default).
default_hpps = get_default_args(Prophet)
# Bare expression so the notebook cell displays the dictionary
default_hpps

#### Broadcast helper functions
These functions persist the data on all the workers so that it can be used inside the UDFs during distributed processing.

In [0]:
def broadcast_holidays(
    config_holidays: dict,
    year_list: list = None,
    country_name: str = "US",
    holiday_lower_window: int = 7,
    holiday_upper_window: int = 7,
) -> pd.DataFrame:
    """Function to return the dataframe of holidays for the given time period using Prophet's make_holidays_df()

    Parameters
    ----------
    config_holidays : dict
        the additional holidays provided by the user in the config file, as
        ``{holiday_name: {'ds': date or list of dates}}``; may be None
    year_list : list, optional
        the list of years for which we need the holidays, by default [2018, 2019, 2020, 2021, 2022]
    country_name : str, optional
        Name of the country based on which holidays can be decided, by default "US"
    holiday_lower_window : int, optional
        number of periods before each holiday covered by its window, by default 7
    holiday_upper_window : int, optional
        number of periods after each holiday covered by its window, by default 7

    Returns
    -------
    pd.DataFrame
        De-duplicated holidays with the columns ds, holiday, lower_window and upper_window
    """
    # The default is built per call instead of sharing one mutable list in the signature
    if year_list is None:
        year_list = [2018, 2019, 2020, 2021, 2022]

    holidays = make_holidays_df(year_list, country_name)
    # Add window (the lower window is stored as a negative offset)
    holidays['lower_window'] = -holiday_lower_window
    holidays['upper_window'] = holiday_upper_window

    # Adding additional holidays: collect the frames and concatenate once at the end
    frames = [holidays]
    for holiday_name, holiday_spec in (config_holidays or {}).items():
        dates = holiday_spec['ds']
        # A single date would otherwise give an all-scalar DataFrame, which pandas rejects
        if not pd.api.types.is_list_like(dates):
            dates = [dates]
        frames.append(pd.DataFrame({'holiday': holiday_name,
                                    'ds': pd.to_datetime(dates),
                                    'lower_window': -holiday_lower_window,
                                    'upper_window': holiday_upper_window}))
    holidays = pd.concat(frames)

    # Dropping duplicates if exists any
    return holidays.drop_duplicates().reset_index(drop=True)

#### Processing Config file
Dependent variable, date variable, modeling granularity & other related modeling details are provided in the form of a config file. Each TS algorithm and the related hyperparameter values to be tried should be given in the config.yml file.

In [0]:
%run ../../../0_Config.ipynb

In [0]:
logger.info("Config file read")

# Every hyperparameter named in the config must be an argument of the Prophet constructor.
# An explicit exception is used because assert statements are skipped under `python -O`.
invalid_hpps = set(app_config["Algorithms"]["Prophet"]["Hyperparameters"].keys()) - set(default_hpps.keys())
if invalid_hpps:
    raise ValueError(
        'keys supplied by the user for the Prophet Algorithm under Hyperparameters must be valid; '
        'unknown keys: ' + ', '.join(sorted(str(k) for k in invalid_hpps))
    )

# For exporting the config file.
# A deep copy is required: later cells rewrite the nested Hyperparameters section of
# app_config, and a shallow copy shares those nested dictionaries, so the exported
# file would no longer match what the user supplied.
temp_config = copy.deepcopy(app_config)

In [0]:
def frange(start, stop, step=1):
    """Float-friendly counterpart of the built-in ``range``.

    Parameters
    ----------
    start : int or float
        first value of the sequence
    stop : int or float
        exclusive upper bound of the sequence
    step : int or float, optional
        positive increment between consecutive values, by default 1

    Returns
    -------
    list
        ``start, start + step, ...`` for every value strictly below ``stop``.
        Integers are returned when all three arguments are integers, floats otherwise.

    Raises
    ------
    ValueError
        If ``step`` is not positive while ``start < stop`` (the sequence would never end)
    """
    if start >= stop:
        return []
    if step <= 0:
        raise ValueError("frange() step must be positive when start < stop")
    if all(isinstance(v, int) for v in (start, stop, step)):
        return list(range(start, stop, step))

    # Decimal arithmetic on the shortest repr of each float keeps the progression exact.
    # Repeated float addition drifts (0.1 + 0.1 + 0.1 != 0.3), which previously let a
    # value equal to `stop` slip into the result.
    current, limit, delta = (Decimal(repr(float(v))) for v in (start, stop, step))
    values = []
    while current < limit:
        values.append(float(current))
        current += delta
    return values

def drange(hyperparameters):
    """Expand range expressions found in the hyperparameter search space.

    A string value that mentions ``range`` (e.g. ``"range(0.1, 0.5, 0.1)"``) is
    evaluated with ``range`` replaced by ``frange`` and substituted by the resulting
    list of unique values, in the order they were generated. All other values
    (lists, numbers, None, plain strings) are left untouched.

    Parameters
    ----------
    hyperparameters : dict
        hyperparameter name -> search space as given in the config file

    Returns
    -------
    dict
        The same dictionary, updated in place
    """
    for key, val in hyperparameters.items():
        # Only strings can hold a range expression; lists, numbers and None pass through
        if not isinstance(val, str) or 'range' not in val:
            continue
        # Word-boundary match so an expression that already says `frange` is not
        # turned into `ffrange`
        expression = re.sub(r'\brange\b', 'frange', val)
        # SECURITY NOTE(review): exec() runs arbitrary code taken from the config file.
        # This is acceptable only while the config is trusted; never point this
        # notebook at a config from an untrusted source.
        _locals = {}
        exec('total_list = ' + expression, globals(), _locals)
        # dict.fromkeys drops duplicates and keeps a deterministic order
        hyperparameters[key] = list(dict.fromkeys(_locals['total_list']))
    return hyperparameters

In [0]:
# Expand range expressions, then drop empty entries from every search space
fit_ = drange(app_config["Algorithms"]["Prophet"]["Hyperparameters"])
fit_new = {}
for key in fit_.keys():
    candidates = fit_[key]
    # A scalar in the config (e.g. `seasonality_mode: additive`) is a one-element
    # search space. Without this wrap a string would be iterated character by
    # character and a number would raise TypeError.
    if not isinstance(candidates, (list, tuple, set)):
        candidates = [candidates]
    temp = []
    for val in candidates:
        # List-valued candidates are stored as their string form
        if isinstance(val, list):
            val = str(val)
        # Skip the different spellings of "no value"
        if val is not None and val not in ('None', 'Null'):
            temp.append(val)
    # Hyperparameters left without any candidate are not tuned at all
    if temp:
        fit_new[key] = temp

app_config["Algorithms"]["Prophet"]["Hyperparameters"] = fit_new

In [0]:
# Create the algo, logs and config directories for storing the results.
# exist_ok=True creates the directory only when missing, without the race between
# a separate "exists" check and the creation.
output_directory = app_config['output_dir_path']
root_dir = "Modeling_Results"
algorithm = "Prophet"
algo_path = os.path.join(output_directory,root_dir,algorithm)
os.makedirs(algo_path, exist_ok=True)
logger.info("Created algorithm directory")

logs_path = os.path.join(output_directory,root_dir,'logs',algorithm)
os.makedirs(logs_path, exist_ok=True)
logger.info("Created logs directory")

config_path = os.path.join(app_config['output_dir_path'],"Modeling_Results","config")
os.makedirs(config_path, exist_ok=True)
logger.info("Created config directory")

#### Broadcasting the required variables
Variables suffixed with "_conf" are taken from the config file

In [0]:
# Search space and modeling granularity taken from the config
hyperparameters_conf = dict(app_config["Algorithms"]["Prophet"]["Hyperparameters"])
modeling_granularity_conf = app_config["modeling_granularity"]

# Names of the dependent variable and of the date column in the input data
dv_config = app_config["dependent_variable"]
ds_config = app_config["date_var"]

# Exogenous variables grouped by the expected sign of their correlation
corr_config = dict(app_config['Algorithms']['Prophet']['exogenous_variables'])
corr_config_broadcast = dotsi.Dict({"value":corr_config})

# Validation settings: evaluation metric, MLflow tracking and backtesting horizon
broadcast_metric = dotsi.Dict({"value":app_config["validation"]["metric"]})
broadcast_tracking = dotsi.Dict({"value":app_config['tracking']})
broadcast_test_periods = dotsi.Dict({"value":app_config["validation"]["no_of_backtesting_test_periods"]})

# Holidays are only built when the config asks for them
if app_config["Algorithms"]["Prophet"]["Holidays"]["include_holidays"] == True:
    aa = app_config["Algorithms"]["Prophet"]["Holidays"]
    holidays_broadcast = dotsi.Dict({"value":broadcast_holidays(
        config_holidays=aa['additional_holidays'],
        year_list=aa['years'],
        country_name=aa['country'],
        holiday_lower_window=aa['holiday_lower_window'],
        holiday_upper_window=aa['holiday_upper_window'],
    )})
else:
    holidays_broadcast = dotsi.Dict({"value":None})

# Remaining values read by the UDFs through their `.value` attribute
broadcast_regressor_mode = dotsi.Dict({"value":app_config["Algorithms"]["Prophet"]["regressor_mode"]})
broadcast_granularity = dotsi.Dict({"value":modeling_granularity_conf})
broadcast_hyper_parameters = dotsi.Dict({"value":hyperparameters_conf})

logger.info("Broadcasted the required variables")

In [0]:
# Reading feature selected output and using the significant variables as idvs in modeling
feature_selection_info = app_config['Algorithms']['Prophet']['feature_selection']
broadcast_use_features = dotsi.Dict({"value":feature_selection_info['use_feature_selected_idvs']})
if feature_selection_info['use_feature_selected_idvs']:
    if feature_selection_info['approach'] == 'lasso_cvglmnet':
        output_folder = app_config['output_dir_path']+"/Feature_Selection/Lasso/"
    else:
        # Fail with a clear message instead of a NameError on `output_folder` below
        raise ValueError(
            "Unsupported feature selection approach: " + str(feature_selection_info['approach'])
        )

    # Reading the latest input file based on the timestamp in its name;
    # files that are not timestamped CSVs are ignored instead of breaking the parsing
    coeff_op_files = [file.replace(".csv","") for file in os.listdir(output_folder)
                      if file.endswith(".csv") and "(" in file]
    if not coeff_op_files:
        raise FileNotFoundError("No feature selection output found in " + output_folder)
    version_dates = [datetime.strptime(x.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S') for x in coeff_op_files]
    max_date = max(version_dates)
    max_date = max_date.strftime('%Y-%m-%d-%H-%M-%S')
    req_file_name = [x for x in coeff_op_files if max_date in x]
    coeff_op_file_path = os.path.join(output_folder,req_file_name[0]+".csv")
    print(coeff_op_file_path)

    # Reading the data; .copy() so the column assignment below does not write to a view
    coeff_df = pd.read_csv(coeff_op_file_path)
    coeff_df = coeff_df[coeff_df['status']=='success'].copy()
    coeff_df[modeling_granularity_conf] = coeff_df[modeling_granularity_conf].astype(str)

    # Must-have IDVs are added for every modeling granularity, whether or not
    # feature selection picked them
    must_have_idvs = feature_selection_info['must_have_idvs'] or []
    idvs_len = len(must_have_idvs)
    if idvs_len > 0:
        temp1 = coeff_df[modeling_granularity_conf].drop_duplicates()
        temp2 = pd.DataFrame({'IDV': must_have_idvs})
        # Every granularity x every must-have IDV
        temp = temp1.merge(temp2, how='cross')
        req_cols = modeling_granularity_conf + ['IDV']
        coeff_df = pd.concat([coeff_df, temp[req_cols]], ignore_index=True)
        # Keep one row per granularity x IDV; the feature-selection row wins over the appended one
        coeff_df = coeff_df.drop_duplicates(subset=req_cols).reset_index(drop=True)
    coeffs_broadcast = dotsi.Dict({"value":coeff_df})
# display(coeff_df)

#### Pandas UDF for Backtesting
The UDF gets executed in multiple worker nodes to parallelize the backtesting process. All the broadcasted variables are accessed within the UDF as and when required

In [0]:
def _select_prophet_regressors(df_data: pd.DataFrame, train: pd.DataFrame) -> list:
    """Choose the exogenous regressors for one backtesting window.

    With feature selection enabled, the regressors are the IDVs stored for the
    modeling granularity of ``df_data``. Otherwise they are the exogenous variables
    from the config, minus those whose correlation with ``y`` on ``train`` has the
    wrong sign (when requested) and minus those that are nearly constant on ``train``.
    """
    if broadcast_use_features.value == True:
        # Reading regressors from feature selection
        coeffs_df = coeffs_broadcast.value
        for x in broadcast_granularity.value:
            coeffs_df = coeffs_df[coeffs_df[x] == df_data[x].iloc[0]]
        return list(coeffs_df['IDV'].values)

    # Candidate regressors: all configured exogenous variables
    corr_var = corr_config_broadcast.value
    regressors = set(corr_var["positive_corr"] + corr_var["negative_corr"] + corr_var['uncertain_corr'])

    # Removing regressors whose observed correlation contradicts the expected sign
    if corr_var["consider_correlation"]:
        wrong_sign = set()
        for x in corr_var["positive_corr"]:
            if train[['y', x]].corr().iloc[0, 1] < 0:
                wrong_sign.add(x)
        for x in corr_var["negative_corr"]:
            if x not in wrong_sign and train[['y', x]].corr().iloc[0, 1] > 0:
                wrong_sign.add(x)
        regressors -= wrong_sign

    # Removing regressors with (almost) no variance in the training window
    near_constant = set()
    for ex_var in regressors:
        mean = train[ex_var].mean()
        std = train[ex_var].std()
        if mean == 0:
            if std <= 0.001:
                near_constant.add(ex_var)
        elif abs(std / mean) <= 0.01:
            near_constant.add(ex_var)

    # Sorted so the regressor order does not depend on set iteration order
    return sorted(regressors - near_constant)


def get_prediction_UDF(df_data: pd.DataFrame) -> pd.DataFrame:
    """Function to perform backtesting for the given input data using the broadcasted information from the config file

    Parameters
    ----------
    df_data : pd.DataFrame
        The rows of one modeling granularity x hyperparameter set x backtesting window,
        including the window indices and the hyperparameter values as columns

    Returns
    -------
    pd.DataFrame
        On success: the forecast of the test periods with granularity, date,
        hyperparameters, window, performance metrics and status "success".
        On failure: a single row carrying the granularity and the error message
        in the status column.
    """
    # Captured before df_data is sorted and sliced, so a failure row always
    # carries the identity of the series
    granularity_cols = list(broadcast_granularity.value)
    granularity_values = df_data[granularity_cols].head(1).reset_index(drop=True)

    try:
        train_index_start = df_data["train_index_start"].iloc[0]
        train_index_end = df_data["train_index_end"].iloc[0]
        test_i = df_data["test_index_end"].iloc[0]
        window_no = df_data["window_no"].iloc[0]
        regressor_mode = broadcast_regressor_mode.value

        # Restrict the series to the current backtesting window
        df_data = df_data.sort_values(by=['ds'], ascending=True)
        df_data = df_data.iloc[train_index_start:test_i].reset_index(drop=True)

        # Train - test split: the last `test_periods` rows of the window are scored
        test_periods = int(broadcast_test_periods.value)
        train = df_data.iloc[:-test_periods]
        test = df_data.iloc[-test_periods:]

        # Updating the default arguments with the parameters provided in the config
        hp_config = broadcast_hyper_parameters.value
        def_args = get_default_args(Prophet)
        for x in hp_config:
            def_args[x] = df_data[x].iloc[0]
        if holidays_broadcast.value is not None:
            def_args["holidays"] = holidays_broadcast.value
        # Fall back to the seasonality mode when no valid regressor mode is configured
        if regressor_mode not in ['additive', 'multiplicative']:
            regressor_mode = def_args['seasonality_mode']

        # Calling the Prophet constructor with the hyperparameters of interest
        m = Prophet(**def_args)
        for var in _select_prophet_regressors(df_data, train):
            m.add_regressor(var, mode=regressor_mode)
        m.fit(train)

        forecast_pd = m.predict(test)
        results_pd = forecast_pd[["ds", "yhat", "yhat_upper", "yhat_lower"]]
        results_pd = pd.merge(
            results_pd, test[["y", "ds"] + granularity_cols], how="left", on="ds"
        )
        y_pred = results_pd["yhat"]
        y_true = results_pd["y"]

        # correct y_pred to 0 if -ve
        y_pred = np.where(y_pred < 0, 0, y_pred)

        # to handle erroneous results epsilon is set to 1.
        epsilon = 1

        # Eval. metrics calculation (one value per window, repeated on every row)
        results_pd["mape"] = (np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), epsilon)) * 100)
        results_pd["wmape"] = (np.sum(np.abs(y_true - y_pred)) / np.maximum(np.sum(np.abs(y_true)), epsilon) * 100)
        results_pd["bias"] = np.mean((y_true - y_pred))
        results_pd["tracking_signal"] = np.sum((y_true - y_pred)) / np.mean(np.abs(y_true - y_pred))
        results_pd["mae"] = mean_absolute_error(y_true, y_pred)
        results_pd["rmse"] = np.sqrt(mean_squared_error(y_true, y_pred))

        # To adhere to defined schema
        for x in granularity_cols:
            results_pd[x] = results_pd[x].astype(str)

        # Append Hyperparameters used
        for x in hp_config:
            results_pd[x] = df_data[x].iloc[0]

        results_pd["window"] = str(str(train_index_start)+" "+str(train_index_end)+" "+str(test_i)+" "+str(window_no))

        # Sales or Quantity can't be negative hence
        results_pd["yhat"] = np.where(results_pd["yhat"] < 0, 0, results_pd["yhat"])
        results_pd["yhat_upper"] = np.where(results_pd["yhat_upper"] < 0, 0, results_pd["yhat_upper"])
        results_pd["yhat_lower"] = np.where(results_pd["yhat_lower"] < 0, 0, results_pd["yhat_lower"])
        results_pd["status"] = "success"
        return results_pd

    except Exception as e:
        # Deliberate catch-all: one failing window must not abort the whole backtest.
        # The column list is flat; a nested list would create MultiIndex columns that
        # do not line up with the successful results when concatenated.
        error_columns = (
            ["ds", "y", "yhat", "yhat_upper", "yhat_lower", "mape", "wmape", "bias",
             "tracking_signal", "mae", "rmse", "window"]
            + list(broadcast_hyper_parameters.value.keys())
            + ["status"]
            + granularity_cols
        )
        results_pd = pd.DataFrame(columns=error_columns, index=range(1))
        results_pd[granularity_cols] = granularity_values
        for x in granularity_cols:
            results_pd[x] = results_pd[x].astype(str)
        results_pd["status"] = str(e)
        return results_pd

#### Hyperparameter grid
The following function creates a cross product of all the hyperparameters provided in the config file and returns a hyperparameter grid

In [0]:
# Creating a hyperparameter grid using the set of provided hyperparameters in the config file
def create_hyperparam_space(hp_space: dict) -> pd.DataFrame:
    """Function to create a hyperparameter grid using the set of provided hyperparameters in the config file to be used for Hyperparameter tuning

    Parameters
    ----------
    hp_space : dict
        hyperparameter name -> list of candidate values, as provided in the config file

    Returns
    -------
    pd.DataFrame
        One column per hyperparameter and one row per combination (cross product).
        An empty search space yields a single row without columns, i.e. one
        combination that keeps every default.
    """
    frames = [pd.DataFrame({name: values}) for name, values in hp_space.items()]

    # Nothing to tune: one "all defaults" combination instead of an IndexError
    if not frames:
        return pd.DataFrame(index=range(1))

    space = frames[0]
    for frame in frames[1:]:
        space = space.merge(frame, how="cross")
    return space

#### Backtesting parallelized using UDF
Based on the Backtesting algorithm (sliding_window/expanding_window), training percentage & test periods specified in the config file, Hyperparameter tuning is performed

In [0]:
def hyperparameter_tuning(
    algorithm: str,
    train_data: pd.DataFrame,
    hyperparam_space: pd.DataFrame,
    modeling_granularity_conf: list,
    train_percentage: float,
    backtesting_test_periods: int,
    test_periods: int,
    stride: int,
    modelling_func: Callable[[pd.DataFrame], pd.DataFrame] = get_prediction_UDF,
) -> pd.DataFrame:
    """Function to perform the hyperparameter tuning

    Parameters
    ----------
    algorithm : str
        the backtesting scheme: "expanding_window" (training always starts at the
        first period) or "sliding_window" (the training start moves forward by
        ``stride`` with every window)
    train_data : pd.DataFrame
        the modelling dataset; must contain the granularity columns and "#total_weeks"
    hyperparam_space : pd.DataFrame
        hyperparameter grid which was created using the set of hyperparameter from config file
    modeling_granularity_conf : list
        The list of the granularity variables at which the models will be built
    train_percentage : float
        The percentage of data points for the training
    backtesting_test_periods : int
        number of time periods to score in each iteration of backtesting
    test_periods : int
        number of time periods to score in final model building
    stride : int
        number of time periods to stride in each iteration of Backtesting
    modelling_func : callable, optional
        the function that backtests one granularity x hyperparameter set x window
        slice, by default get_prediction_UDF

    Returns
    -------
    pd.DataFrame
        Returns the dataframe containing the results at modelling granularity x hyperparameter set x window level

    Raises
    ------
    ValueError
        If ``algorithm`` is neither "expanding_window" nor "sliding_window"
    """
    if algorithm not in ("expanding_window", "sliding_window"):
        raise ValueError(
            "Unknown backtesting algorithm '{0}'; expected 'expanding_window' or 'sliding_window'".format(algorithm)
        )
    sliding = algorithm == "sliding_window"

    granularity = list(modeling_granularity_conf)
    window_cols = ['train_index_start', 'train_index_end', 'test_index_end', 'window_no']
    group_cols = granularity + list(hyperparam_space.columns) + window_cols

    # One row per time series with its number of observations
    series_lengths = train_data[granularity + ["#total_weeks"]].drop_duplicates().reset_index(drop=True)

    window_rows = []
    for record in series_lengths.itertuples(index=False, name=None):
        series_key, total_weeks = list(record[:-1]), record[-1]
        # The last `test_periods` observations are held out for the final model
        train_period_ends = total_weeks - test_periods
        first_train_end = int(train_period_ends * train_percentage)
        window_no = 0
        for train_i in range(first_train_end, train_period_ends, stride):
            test_i = train_i + backtesting_test_periods
            # Skip windows whose test periods would run into the hold-out
            if test_i > train_period_ends:
                continue
            # Expanding windows always start at 0; sliding windows advance by one
            # stride for every window created so far
            train_index_start = window_no * stride if sliding else 0
            window_no += 1
            window_rows.append(series_key + [train_index_start, train_i, test_i, window_no])

    # create all windows combination.
    df_windows = pd.DataFrame(window_rows, columns=granularity + window_cols)
    f_df = train_data.merge(df_windows, on=granularity, how="left")

    # Cross join with the hyperparameter grid through a constant key;
    # assign() keeps the caller's grid free of the helper column
    f_df['temppp'] = 1
    f_df = f_df.merge(hyperparam_space.assign(temppp=1), on='temppp', how="left")

    # One group per granularity x hyperparameter set x window. The separator stops
    # different value combinations from collapsing into the same key
    # (e.g. "1" + "23" versus "12" + "3").
    group_key = f_df[group_cols[0]].astype(str)
    for col in group_cols[1:]:
        group_key = group_key + "\x1f" + f_df[col].astype(str)
    f_df['gran_tempp'] = group_key

    # Groups are processed in order of first appearance and concatenated once
    results = [modelling_func(chunk) for _, chunk in f_df.groupby('gran_tempp', sort=False)]
    if not results:
        return pd.DataFrame()
    return pd.concat(results)

#### Loading the latest Missing_value_treatment file
##### Please update the reading path with the required data path if "Missing value treatment" was not run

In [0]:
# Reading the latest input file based on timestamp
missing_value_dir = app_config['output_dir_path']+"/Data_Processing/Missing_value_treatment"
all_files = os.listdir(missing_value_dir)
missing_op_files = [file.replace(".csv","") for file in all_files if "Missing_value_treatment_results (" in file]
if not missing_op_files:
    # Clear message instead of "max() arg is an empty sequence"
    raise FileNotFoundError("No 'Missing_value_treatment_results (...)' file found in " + missing_value_dir)
version_dates = [datetime.strptime(x.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S') for x in missing_op_files]
max_date = max(version_dates)
max_date = max_date.strftime('%Y-%m-%d-%H-%M-%S')
req_file_name = [x for x in missing_op_files if max_date in x]
missing_op_file_path = os.path.join(missing_value_dir,req_file_name[0] + ".csv")
# print(missing_op_file_path)

# Reading the data
df = pd.read_csv(missing_op_file_path)
# print(df.shape)

# Prophet expects the date column to be called "ds" and the target "y"
df.rename(columns = {ds_config:"ds", dv_config:"y"}, inplace = True)
logger.info("Data loaded")

df['ds'] = pd.to_datetime(df['ds'])
df[modeling_granularity_conf] = df[modeling_granularity_conf].astype(str)

# Getting the total number of weeks for each time series
temp_df = df.groupby(modeling_granularity_conf).agg({'ds':'count'}).rename(columns={'ds': '#total_weeks'}).reset_index()
df = df.merge(temp_df, on = modeling_granularity_conf ,how = "left")

# Create the hyperparameter space
hpspace = create_hyperparam_space(hyperparameters_conf)
logger.info("Created hyperparameter grid")

# HP tuning based on algorithm of choice
df_f = hyperparameter_tuning(app_config['validation']['backtesting']['algorithm'],\
                             df,hpspace,modeling_granularity_conf,\
                             app_config['validation']['train_percentage'],\
                             app_config['validation']['no_of_backtesting_test_periods'],\
                             app_config['validation']['no_of_test_periods'],\
                             app_config['validation']['backtesting']['stride'],modelling_func = get_prediction_UDF)

if holidays_broadcast.value is not None:
    # Work on a copy: converting "ds" to text in place would alter the holidays
    # that the modeling UDF reads from holidays_broadcast
    holidays_df = holidays_broadcast.value.copy()
    holidays_df["ds"] = holidays_df["ds"].astype(str)
    # The holidays used are stored, as JSON, next to every result row
    df_f['holidays'] = holidays_df.reset_index(drop = True).to_json()
df_f.to_csv(algo_path + "/Backtesting_results_window_level (" + datetime.today().strftime('%Y-%m-%d-%H-%M-%S')+").csv", index = False)
logger.info("Completed Backtesting")

### Choosing the best hyperparameters

In [0]:
# Locate the most recent window-level backtesting export via the timestamp in its name
all_files = os.listdir(algo_path)
backtesting_files = [
    file.replace(".csv","")
    for file in all_files
    if "Backtesting_results_window_level (" in file
]
# The timestamp sits between the parentheses of the file name
version_dates = [
    datetime.strptime(x.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S')
    for x in backtesting_files
]
max_date = max(version_dates).strftime('%Y-%m-%d-%H-%M-%S')
req_file_name = [x for x in backtesting_files if max_date in x]
backtesting_results_file_path = os.path.join(algo_path, "{0}.csv".format(req_file_name[0]))
# print(backtesting_results_file_path)

In [0]:
# Load the backtesting results and keep only the windows that were fitted successfully
df = pd.read_csv(backtesting_results_file_path).loc[lambda frame: frame['status'] == 'success']
# display(df)

In [0]:
# One row of metrics per modeling granularity x hyperparameter set x backtesting window
window_level = [*modeling_granularity_conf, *hyperparameters_conf.keys(), "window"]
window_level_results = (
    df.groupby(window_level)[["mape","wmape","bias","tracking_signal","mae","rmse"]]
    .min()
    .reset_index()
)

In [0]:
# Average the window-level metrics per modeling granularity x hyperparameter set
hyperparam_level = [*modeling_granularity_conf, *hyperparameters_conf.keys()]
hyperparam_level_results = (
    window_level_results.groupby(hyperparam_level)[["mape","wmape","bias","tracking_signal","mae","rmse"]]
    .mean()
    .reset_index()
)

In [0]:
# Timestamped export of the hyperparameter-level backtesting metrics
hyperparam_level_results.to_csv(
    "{0}/Backtesting_results_hyperparameter_level ({1}).csv".format(
        algo_path, datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
    ),
    index=False,
)
logger.info("Exported backtesting results")

#### Pandas UDF for getting best hyperparameters & MLFlow tracking

In [0]:
# Constant tags, logged as MLflow parameters by get_best_hyperparameters
hyperparam_level_results = hyperparam_level_results.assign(
    algorithm='Prophet',
    result_type='backtesting',
)

In [0]:
def get_best_hyperparameters(final_hyperparam_df: pd.DataFrame) -> pd.DataFrame:
    """Function for getting the best hyperparameters results for each modeling granularity along with MLFlow tracking using the broadcasted information from the config file

    Parameters
    ----------
    final_hyperparam_df : pd.DataFrame
        Backtesting results of a single modeling granularity, one row per hyperparameter set

    Returns
    -------
    pd.DataFrame
        A single row. On success: the granularity, the best hyperparameter set, its
        metrics and status "success". On failure: the granularity and the error
        message in the status column.
    """
    try:
        results_pd = {}

        # Get the modeling granularity
        broadcast_gran = broadcast_granularity.value
        # Copy of the tracking settings, so resolving the experiment id below does
        # not alter the broadcast value
        tracking_value = broadcast_tracking.value.copy()

        # To adhere to defined schema
        for x in broadcast_gran:
            results_pd[x] = final_hyperparam_df[x].astype(str).iloc[0]

        hp_config = broadcast_hyper_parameters.value

        if tracking_value["tracking_needed"] == True:
            if tracking_value["type"] != "Managed":
                if tracking_value["tracking_uri"] is not None:
                    # Self-hosted tracking: resolve the experiment on the file store
                    # and use its id from here on
                    mlflow.set_tracking_uri("file:" + tracking_value["tracking_uri"])
                    experiment_id = mlflow.set_experiment(tracking_value["mlflow_experiment_id"])
                    tracking_value['mlflow_experiment_id'] = experiment_id.experiment_id

            # One parent run per modeling granularity, one nested run per hyperparameter set
            with mlflow.start_run(experiment_id=tracking_value["mlflow_experiment_id"]):
                for x in ["algorithm", 'result_type'] + broadcast_gran:
                    mlflow.log_param(x, final_hyperparam_df[x].iloc[0])
                for row in range(0, len(final_hyperparam_df)):
                    with mlflow.start_run(experiment_id=tracking_value["mlflow_experiment_id"], nested=True):
                        for x in hp_config:
                            mlflow.log_param(x, final_hyperparam_df[x].iloc[row])
                        for x in ["mape", "wmape", "bias", "tracking_signal", "mae", "rmse"]:
                            mlflow.log_metric(x, final_hyperparam_df[x].iloc[row])

        metric = broadcast_metric.value

        # Sort using the metric of interest. Signed metrics are ranked by magnitude,
        # all others by value. Sorting into a new frame leaves the caller's data untouched.
        if metric in ["tracking_signal", "bias"]:
            ranked = final_hyperparam_df.sort_values(metric, ascending=True, key=abs)
        else:
            ranked = final_hyperparam_df.sort_values(metric, ascending=True)

        # Adding the best hyperparameter and related metrics
        for x in hp_config:
            results_pd[x] = ranked[x].iloc[0]

        for x in ["mape", "wmape", "bias", "tracking_signal", "mae", "rmse"]:
            results_pd[x] = ranked[x].iloc[0]

        results_pd["status"] = "success"
        return pd.DataFrame.from_dict([results_pd])

    except Exception as e:
        # Deliberate catch-all: one failing granularity must not stop the others.
        # The column list is flat; a nested list would create MultiIndex columns that
        # do not line up with the successful rows when concatenated.
        granularity_cols = list(broadcast_granularity.value)
        error_columns = (
            ["mape", "wmape", "bias", "tracking_signal", "mae", "rmse"]
            + list(broadcast_hyper_parameters.value.keys())
            + ["status"]
            + granularity_cols
        )
        results_pd = pd.DataFrame(columns=error_columns, index=range(1))
        results_pd[granularity_cols] = final_hyperparam_df[granularity_cols].head(1).reset_index(drop=True)
        for x in granularity_cols:
            results_pd[x] = results_pd[x].astype(str)
        results_pd["status"] = str(e)
        return results_pd

In [0]:
# One key per modeling granularity. The separator stops different value combinations
# from collapsing into the same key (e.g. "1" + "23" versus "12" + "3").
gran_key_parts = [hyperparam_level_results[col].astype(str) for col in modeling_granularity_conf]
gran_key = gran_key_parts[0]
for gran_key_part in gran_key_parts[1:]:
    gran_key = gran_key + "\x1f" + gran_key_part
hyperparam_level_results['gran_tempp'] = gran_key
unique_pdts = hyperparam_level_results['gran_tempp'].unique()

# Pick the best hyperparameter set per granularity (in order of first appearance)
# and concatenate the one-row results once
best_hyperparam_frames = [
    get_best_hyperparameters(group)
    for _, group in hyperparam_level_results.groupby('gran_tempp', sort=False)
]
best_hyperparam_results = pd.concat(best_hyperparam_frames) if best_hyperparam_frames else pd.DataFrame()

#### Writing the best hyperparameter results

In [0]:
# Timestamped export of the best hyperparameter set per modeling granularity
best_hyperparam_results.to_csv(
    "{0}/Best_hyperparameters ({1}).csv".format(
        algo_path, datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
    ),
    index=False,
)
logger.info("Exported best hyperparameter results")

In [0]:
# Write the config snapshot next to the results; the file name carries the
# experiment id and a millisecond timestamp
config_file_name = "config_for_exp_id_{0} ({1}).yml".format(
    str(broadcast_tracking.value['mlflow_experiment_id']),
    datetime.today().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3],
)
config_path1 = os.path.join(config_path, config_file_name)
with open(config_path1, 'w') as file:
    yaml.dump(temp_config, file, default_flow_style=False, sort_keys=False)

In [0]:
# Move from tmp directory to req. location in datalake
import platform
import shutil
plat_sys = platform.system()

if(plat_sys!='Windows'):
    log_file_path = os.path.join(p_dir, log_file)
    # Guard so that re-running the cell after the file was moved is harmless
    if os.path.exists(log_file_path):
        # Release the file before moving it; a move across file systems is a
        # copy + delete, and records written afterwards would be lost
        fh.close()
        logger.removeHandler(fh)
        # shutil.move needs no shell quoting for the spaces and parentheses in the
        # file name and raises on failure instead of failing silently
        shutil.move(log_file_path, logs_path)