### Objective:
The objective of the notebook is to -
* Build the final model of LSTM algorithm using the best hyperparameter set (identified using Backtesting) and score the test set to get a performance metric
* For forecasting future periods, we will re-train the model with the same hyperparameter set on the train + validation + test set to capture the patterns in the test set and then forecast future N periods

In [0]:
import yaml
import inspect
import glob
import numpy as np
import pandas as pd
from distutils.command.config import config
from tqdm.auto import tqdm
from datetime import timedelta
from datetime import datetime
import mlflow
from sklearn.metrics import mean_absolute_error,mean_squared_error
import os
import logging
import dotsi
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM,TimeDistributed,RepeatVector

In [0]:
# Logging setup: every run writes to its own timestamped file under the temp directory
p_dir = "/tmp/"
log_file = "".join(["LSTM_model_eval_retraining_scoring", " (", datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), ").log"])

# File handler using the "time - logger name - level - message" layout
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
fh = logging.FileHandler(p_dir+log_file)
fh.setFormatter(formatter)

# Named logger shared by the remaining cells; DEBUG keeps every message
logger = logging.getLogger('custom_log')
logger.setLevel(logging.DEBUG)
logger.addHandler(fh)

In [0]:
# Getting the default settings of hyperparameters. Used to check that user-provided hyperparameters must always be a subset of these.
def get_default_args(func) -> dict:
    """Collect the parameters of a callable together with their default values

    Parameters
    ----------
    func : callable
        The function or method whose signature is inspected
        (Eg: Sequential.compile, Sequential.fit)

    Returns
    -------
    dict
        returns a dictionary mapping every parameter name except 'self' to its
        default value; parameters without a default are mapped to None
    """
    defaults = {}
    for name, param in inspect.signature(func).parameters.items():
        # 'self' is never a hyperparameter, so it is left out
        if name == 'self':
            continue
        has_default = param.default is not inspect.Parameter.empty
        defaults[name] = param.default if has_default else None
    return defaults
    
# Parameter names (with defaults) accepted by Keras `Sequential.compile` and `Sequential.fit`;
# these are the reference key sets for the hyperparameters supplied in the config.
default_hpps_compile = get_default_args(Sequential.compile)
default_hpps_fit = get_default_args(Sequential.fit)

In [0]:
%run ../../../0_Config.ipynb

In [0]:
import copy

logger.info("Config file read")

# Every user-supplied hyperparameter key must be a real argument of the
# corresponding Keras method (compile / fit).
assert set(app_config["Algorithms"]["LSTM"]["Hyperparameters"]['compile'].keys()).\
           issubset(set(default_hpps_compile.keys())),\
           'keys supplied by the user for the LSTM Algorithm under comiple method must be valid'
assert set(app_config["Algorithms"]["LSTM"]["Hyperparameters"]['fit'].keys()).\
           issubset(set(default_hpps_fit.keys())),\
           'keys supplied by the user for the LSTM Algorithm under fit method must be valid'

# For exporting the config file.
# A deep copy is required: the following cells rewrite the nested
# app_config["Algorithms"]["LSTM"]["Hyperparameters"] entry in place, and a
# shallow copy shares those nested dicts, so the exported file would contain
# the rewritten values instead of the configuration the user supplied.
temp_config = copy.deepcopy(app_config)

In [0]:
def frange(start,stop,step= 1):
    """Float-friendly counterpart of range(): values from start up to, but excluding, stop

    Parameters
    ----------
    start : int or float
        First value of the sequence
    stop : int or float
        Exclusive upper bound of the sequence
    step : int or float, optional
        Positive increment between consecutive values (default 1)

    Returns
    -------
    list
        The generated values, each rounded to len(str(step)) decimals.
        Values stay integers when both start and step are integers.
        An empty list is returned when start >= stop.

    Raises
    ------
    ValueError
        If step is not positive while start < stop (the sequence would never end)
    """
    if start >= stop:
        return []
    if step <= 0:
        raise ValueError("frange() step must be positive")

    ndigits = len(str(step))

    # Integer progression: no rounding error is possible, keep the int type
    if isinstance(start, int) and isinstance(step, int):
        values = []
        current = start
        while current < stop:
            values.append(round(current, ndigits))
            current += step
        return values

    # Repeated float addition drifts (0.1 added ten times gives 0.9999999999999999),
    # which used to let the exclusive stop value slip into the result.
    # Stepping in exact decimal arithmetic avoids that.
    from decimal import Decimal
    exact_stop = Decimal(str(stop))
    exact_step = Decimal(str(step))
    values = [round(start, ndigits)]
    current = Decimal(str(start)) + exact_step
    while current < exact_stop:
        values.append(round(float(current), ndigits))
        current += exact_step
    return values

def drange(hyperparameters):
    """Expand 'range(...)' expressions found in a hyperparameter dictionary

    Parameters
    ----------
    hyperparameters : dict
        Hyperparameter names mapped to their candidate values. A value given as a
        string containing 'range' (Eg: "range(10,50,10)") is treated as an expression.

    Returns
    -------
    dict
        The same dictionary object (modified in place): every range expression is
        replaced by the list of its distinct values; all other values are untouched
    """
    for key, val in hyperparameters.items():
        # Only strings can hold a range expression; scalars such as 32 or None
        # used to raise TypeError on the membership test.
        if isinstance(val, str) and 'range' in val:
            # frange accepts float steps, which the builtin range does not
            expression = val.replace('range','frange')
            # NOTE(review): eval executes text taken from the config file; this is
            # only acceptable while the config comes from a trusted source.
            total_list = eval(expression, globals())
            hyperparameters[key] = list(set(total_list))
    return hyperparameters

In [0]:
# Expand range expressions, then fold the compile() settings into the fit() settings
fit_ = drange(app_config['Algorithms']['LSTM']['Hyperparameters']['fit'])
compile_ = drange(app_config['Algorithms']['LSTM']['Hyperparameters']['compile'])
for key, compile_vals in compile_.items():
    if key in fit_:
        # key supplied under both methods: keep the distinct values of both
        fit_[key] = list(set(fit_[key]+compile_vals))
    else:
        fit_[key] = list(compile_vals)

# Keep only usable candidate values: list values are stored as strings and
# None-like placeholders are dropped; keys left without values are skipped
fit_new = {}
for key, candidates in fit_.items():
    temp = [str(val) if type(val) is list else val for val in candidates]
    temp = [val for val in temp if val not in ('None', 'Null', None)]
    if temp:
        fit_new[key] = temp

# These fit() arguments are set by the modeling code itself, never tuned
for val in ('x','y','validation_data','kwargs'):
    fit_new.pop(val, None)

app_config["Algorithms"]["LSTM"]["Hyperparameters"] = fit_new

In [0]:
# Build the output locations (results, logs, exported config) and create the missing ones
output_directory = app_config['output_dir_path']
root_dir = "Modeling_Results"
algorithm = "LSTM"

algo_path = os.path.join(output_directory,root_dir,algorithm)
logs_path = os.path.join(output_directory,root_dir,'logs',algorithm)
config_path = os.path.join(app_config['output_dir_path'],"Modeling_Results","config")

for _target, _message in (
    (algo_path, "Created algorithm directory"),
    (logs_path, "Created logs directory"),
    (config_path, "Created config directory"),
):
    if not os.path.exists(_target):
        os.makedirs(_target)
    logger.info(_message)

In [0]:
# Settings read by the scoring function, each wrapped in a dotsi.Dict exposing `.value`
_lstm_settings = app_config["Algorithms"]["LSTM"]
_validation_settings = app_config["validation"]

hyperparameters_conf = dict(_lstm_settings["Hyperparameters"])
modeling_granularity_conf = app_config["modeling_granularity"]

# Names of the dependent variable and of the date column in the input data
dv_config = app_config["dependent_variable"]
ds_config = app_config["date_var"]

# Exogenous variables, listed under positive / negative / uncertain correlation
corr_config = dict(_lstm_settings['exogenous_variables'])
_all_exogenous = corr_config['positive_corr']+corr_config['negative_corr']+corr_config['uncertain_corr']
corr_config_broadcast = dotsi.Dict({"value":corr_config})
broadcast_regressors = dotsi.Dict({"value":list(set(_all_exogenous))})

# Evaluation metric and size of the test set
broadcast_metric = dotsi.Dict({"value":_validation_settings['metric']})
broadcast_test_periods = dotsi.Dict({"value":_validation_settings["no_of_test_periods"]})

# Modeling granularity, hyperparameter names and window lengths
broadcast_granularity = dotsi.Dict({"value":modeling_granularity_conf})
broadcast_hyper_parameters = dotsi.Dict({"value":hyperparameters_conf})
broadcast_forecast_periods = dotsi.Dict({"value":_lstm_settings["forecast_periods"]})
broadcast_lookback_periods = dotsi.Dict({"value":_lstm_settings["lookback_periods"]})

# MLflow tracking settings; the "Out of Sample" marker enables tracking in the scoring function
broadcast_tracking = dotsi.Dict({"value":app_config['tracking']})
mlflow_tracking_check = dotsi.Dict({"value":"Out of Sample"})
logger.info("Broadcasted the required variables")

In [0]:
# Pick the most recent "Best_hyperparameters (<timestamp>)" export in the algorithm folder
all_files = os.listdir(algo_path)
best_hyp_files = [file.replace(".csv","") for file in all_files if "Best_hyperparameters (" in file]

# The timestamp sits between the parentheses of the file name
version_dates = []
for x in best_hyp_files:
    stamp = x.split('(')[1].replace(')','')
    version_dates.append(datetime.strptime(stamp, '%Y-%m-%d-%H-%M-%S'))
max_date = max(version_dates).strftime('%Y-%m-%d-%H-%M-%S')
req_file_name = [x for x in best_hyp_files if max_date in x]
best_hyp_param_results_file_path = os.path.join(algo_path,req_file_name[0]+".csv")
print(best_hyp_param_results_file_path)

# Keep the successful rows only, with granularity columns as strings and
# 'true'/'false' text converted to booleans
best_hyperparam_results = pd.read_csv(best_hyp_param_results_file_path)
_succeeded = best_hyperparam_results['status']=='success'
best_hyperparam_results = best_hyperparam_results[_succeeded].reset_index(drop = True)
best_hyperparam_results[modeling_granularity_conf] = best_hyperparam_results[modeling_granularity_conf].astype(str)
for _text, _flag in (('true', True), ('false', False)):
    best_hyperparam_results.replace([_text],_flag, inplace = True)
best_hyperparam_results_broadcast = dotsi.Dict({"value":best_hyperparam_results})
logger.info("Read the best hyperparamter results")
best_hyperparam_results

Unnamed: 0,Div_No,Store_No,Base_UPC,epochs,batch_size,verbose,shuffle,loss,optimizer,mape,wmape,bias,tracking_signal,mae,rmse,status
0,24,15,4000046410,50,32,2,False,mae,adam,107.415744,61.084682,-0.071489,-0.463355,4.145606,4.828644,success


In [0]:
# Reading feature selected output and using the significant variables as idvs in modeling
feature_selection_info = app_config['Algorithms']['LSTM']['feature_selection']
broadcast_use_features = dotsi.Dict({"value":feature_selection_info['use_feature_selected_idvs']})
if(feature_selection_info['use_feature_selected_idvs']):
    if(feature_selection_info['approach']=='lasso_cvglmnet'):
        output_folder = app_config['output_dir_path']+"/Feature_Selection/Lasso/"
    else:
        # Previously this fell through to a NameError on `output_folder`
        raise ValueError("Unsupported feature selection approach: " + str(feature_selection_info['approach']))

    # Reading the latest input file based on timestamp
    coeff_op_files = [file.replace(".csv","") for file in os.listdir(output_folder)]
    version_dates = [datetime.strptime(x.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S') for x in coeff_op_files]
    max_date = max(version_dates).strftime('%Y-%m-%d-%H-%M-%S')
    req_file_name = [x for x in coeff_op_files if max_date in x]
    coeff_op_file_path = os.path.join(output_folder,req_file_name[0] + ".csv")
    print(coeff_op_file_path)

    # Reading the data; .copy() so the column assignment below works on an
    # independent frame rather than a filtered view
    coeff_df = pd.read_csv(coeff_op_file_path)
    coeff_df = coeff_df[coeff_df['status']=='success'].copy()
    coeff_df[modeling_granularity_conf] = coeff_df[modeling_granularity_conf].astype(str)

    must_have_idvs = list(feature_selection_info['must_have_idvs'])
    if(len(must_have_idvs)>0):
        # Add every must-have IDV to every modeling granularity.
        # The earlier DataFrame.join on an overlapping 'temp' column raised a
        # ValueError and its result was never used, so the must-have IDVs were
        # never actually added.
        req_cols = list(modeling_granularity_conf) + ['IDV']
        temp1 = coeff_df[modeling_granularity_conf].drop_duplicates().assign(temp = 1)
        temp2 = pd.DataFrame({'IDV':must_have_idvs, 'temp':1})
        # merging on the constant key yields the granularity x IDV cross product
        temp = temp1.merge(temp2, on = 'temp', how = 'inner')[req_cols]
        # Rows already selected keep their values; added rows carry NaN in the
        # remaining columns. One row per (granularity, IDV) is retained.
        coeff_df = pd.concat([coeff_df, temp], ignore_index = True).drop_duplicates(subset = req_cols, keep = 'first')
    coeffs_broadcast = dotsi.Dict({"value":coeff_df})
    broadcast_regressors = dotsi.Dict({"value":list(coeff_df['IDV'].unique())})
# display(coeff_df)

In [0]:
def get_forecast_UDF(df_data: pd.DataFrame)-> pd.DataFrame:
    """Function to build the final LSTM model for one modeling granularity and score its test windows,
    utilizing the broadcasted details from the config file

    Parameters
    ----------
    df_data : pd.DataFrame
        The data of a single modeling granularity. Must contain the granularity columns,
        'ds' (date), 'y' (dependent variable) and the regressor columns in use.

    Returns
    -------
    pd.DataFrame
        On success: one row per (window, date) of the test period with the granularity columns,
        'ds', 'y', 'yhat', 'test_flag', 'window', the window-level metrics
        (mape, wmape, bias, tracking_signal, mae, rmse), the hyperparameters used and status = 'success'.
        On failure: a single row holding the granularity values, with the error message in 'status'.
    """
    try:
        df_data = df_data.sort_values(by=['ds'],ascending=True)
        hpt = best_hyperparam_results_broadcast.value

        # broadcast_granularity
        broadcast_gran = broadcast_granularity.value

        # get best hyperparameters for the given modeling granularity
        for x in list(broadcast_gran):
            hpt = hpt[hpt[x] == df_data[x].iloc[0]]

        # number of test periods to look
        lookback_period = broadcast_lookback_periods.value
        forecast_period = broadcast_forecast_periods.value
        test_periods1 = int(broadcast_test_periods.value)
        # number of rolling windows of length forecast_period that fit in the test periods
        test_periods = test_periods1 - forecast_period + 1

        if(broadcast_use_features.value==True):
            # Reading regressors from feature selection
            coeffs_df = coeffs_broadcast.value
            for x in broadcast_gran:
                coeffs_df = coeffs_df[coeffs_df[x] == df_data[x].iloc[0]]
            regressors = list(coeffs_df['IDV'].values)
        else:
            # Appending regressors based on the sign of correlation
            corr_var = corr_config_broadcast.value
            regressors = list(set(corr_var["positive_corr"] + corr_var["negative_corr"]+corr_var['uncertain_corr']))

            temp_list1 = []
            # Removing regressors whose observed correlation with y contradicts the configured sign
            if(corr_var["consider_correlation"]):
                for x in corr_var["positive_corr"]:
                    # .iloc[0, 1] is the y-vs-x entry of the 2x2 correlation matrix
                    if(df_data[['y',x]].corr().iloc[0, 1]<0):
                        temp_list1.append(x)
                for x in corr_var["negative_corr"]:
                    if (x not in temp_list1):
                        if(df_data[['y',x]].corr().iloc[0, 1]>0):
                            temp_list1.append(x)
                regressors = list(set(regressors) - set(temp_list1))

            # Checking for variance in the regressor: (near-)constant regressors are dropped
            temp_list2 = []
            if len(regressors)>0:
                for ex_var in regressors:
                    mean = df_data[ex_var].mean()
                    std = df_data[ex_var].std()
                    if mean == 0:
                        if std <= 0.001:
                            temp_list2.append(ex_var)
                    else:
                        # coefficient of variation of at most 1%
                        if abs(std/mean) <= 0.01:
                            temp_list2.append(ex_var)

            regressors = list(set(regressors) - set(temp_list2))

        # filtering for the required data
        data = df_data[['y']+regressors].astype('float32')
        n_vars = len(regressors) + 1
        cols, names = list(), list()

        # input sequence (t-n, ... t-1); var1 is always 'y'
        for i in range(lookback_period, 0, -1):
            cols.append(data.shift(i))
            names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]

        # forecast sequence (t, t+1, ... t+n)
        for i in range(0, forecast_period):
            cols.append(data.shift(-i))
            if i == 0:
                names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
            else:
                names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]

        # put it all together
        agg = pd.concat(cols, axis=1)
        agg.columns = names

        # drop rows with NaN values (the incomplete windows created by shifting)
        agg.dropna(inplace=True)

        # only 'y' (var1) is predicted, so the regressors of the forecast steps are dropped
        cols_to_drop = [col for col in agg.columns if ("var1" not in col) & (("(t)" in col) | ("(t+" in col))]
        agg.drop(cols_to_drop, axis=1, inplace=True)

        # the last `test_periods` rows are the test windows
        values = agg.values
        train_val = values[:values.shape[0]-test_periods, :]
        test_val = values[values.shape[0]-test_periods:, :]

        # split into input and outputs
        train_X, train_y = train_val[:, :-forecast_period], train_val[:, -forecast_period:]
        test_X, test_y = test_val[:, :-forecast_period], test_val[:, -forecast_period:]

        # reshape input to be 3D [samples, timesteps, features]
        train_X = train_X.reshape((train_X.shape[0], lookback_period, n_vars))
        test_X = test_X.reshape((test_X.shape[0], lookback_period, n_vars))

        hp_config = broadcast_hyper_parameters.value

        def _best_value(name):
            # Best value of one hyperparameter; list values are stored as text in the csv
            value = hpt[name].iloc[0]
            if isinstance(value, str) and '[' in value:
                # NOTE(review): eval executes text read from the best-hyperparameters csv;
                # ast.literal_eval would be safer if these are always plain literals - confirm
                value = eval(value)
            return value

        # Updating the default arguments with the parameters provided in the config
        def_args = get_default_args(Sequential.compile)
        for x in hp_config:
            if(x in def_args.keys()):
                def_args[x] = _best_value(x)
        if('kwargs' in def_args.keys()):
            del def_args['kwargs']

        def_args_fit = get_default_args(Sequential.fit)
        for x in hp_config:
            if(x in def_args_fit.keys()):
                def_args_fit[x] = _best_value(x)
        def_args_fit['x'] = train_X
        def_args_fit['y'] = train_y
        def_args_fit['validation_data'] = (test_X, test_y)

        # Calling the LSTM constructor with the hyperparameters of interest
        tensorflow.keras.utils.set_random_seed(1)
        # design network
        model = Sequential()
        model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]))) ## encoder
        model.add(RepeatVector(forecast_period))
        model.add(LSTM(50, activation='relu', return_sequences=True)) ## decoder
        model.add(Dense(1))
        model.compile(**def_args)
        model.fit(**def_args_fit)

        ### predictions
        test_predict=model.predict(test_X)

        # test period dates; window k covers dates k .. k+forecast_period-1 of the test period
        test_dates = df_data.iloc[-test_periods1:]['ds'].values
        test = pd.DataFrame()
        for window in range(len(test_predict)):
            test = pd.concat([test,pd.DataFrame({'ds':test_dates[window:forecast_period+window],\
                                                 'yhat':list(list(zip(*test_predict[window]))[0]),\
                                                 'window':np.repeat(window+1,forecast_period)})])
        test['test_flag'] = 1
        results_pd = pd.merge(test,df_data,how='left')
        results_pd = results_pd[broadcast_gran+['ds', 'y', 'yhat','test_flag','window']].reset_index(drop = True)
        # Sales or Quantity can't be negative hence
        results_pd["yhat"] = np.where(results_pd["yhat"]<0,0,results_pd["yhat"])

        # Eval. metrics calculation
        # to handle erroneous results epsilon is set to 1.
        epsilon = 1
        temp_data1 = pd.DataFrame(index= range(1))
        temp_data2 = pd.DataFrame()
        for window in results_pd['window'].unique():
            temp_data = results_pd[results_pd['window']==window]
            y_pred = temp_data['yhat']
            y_true = temp_data['y']

            temp_data1['window'] = window
            # Eval. metrics calculation
            temp_data1['mape'] = np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), epsilon))*100
            temp_data1['wmape'] = np.sum(np.abs(y_true - y_pred)) / np.maximum(np.sum(np.abs(y_true)),epsilon)*100
            temp_data1['bias'] = np.mean((y_true - y_pred))
            temp_data1['tracking_signal'] = np.sum((y_true - y_pred)) / np.mean(np.abs(y_true - y_pred))
            temp_data1['mae'] = mean_absolute_error(y_true, y_pred)
            temp_data1['rmse']=np.sqrt(mean_squared_error(y_true, y_pred))
            temp_data2 = pd.concat([temp_data2,temp_data1],ignore_index = True)

        results_pd = pd.merge(results_pd,temp_data2,how='left',on='window')

        # To adhere to defined schema
        for x in broadcast_gran + ['window']:
            results_pd[x] = results_pd[x].astype(str)

        # Append Hyperparameters used
        for x in hp_config:
            results_pd[x] = hpt[x].iloc[0]

        # Get the experiment id
        tracking_value = broadcast_tracking.value.copy()
        if(mlflow_tracking_check.value == "Out of Sample" and tracking_value["tracking_needed"] == True):
            if(tracking_value['type']!="Managed"):
                if(tracking_value['tracking_uri'] is not None):
                    mlflow.set_tracking_uri("file:"+tracking_value['tracking_uri'])
                    experiment_id = mlflow.set_experiment(tracking_value["mlflow_experiment_id"])
                    tracking_value['mlflow_experiment_id'] = experiment_id.experiment_id
            # One MLflow run per modeling granularity
            with mlflow.start_run(experiment_id = tracking_value['mlflow_experiment_id']):
                mlflow.log_param('algorithm', 'LSTM')
                mlflow.log_param('result_type', 'out_of_sample')
                for x in broadcast_gran:
                    mlflow.log_param(x, results_pd[x].iloc[0])
                for x in hp_config:
                    mlflow.log_param(x, results_pd[x].iloc[0])
                temp_test = results_pd[results_pd['test_flag']==1].reset_index(drop = True)
                # the first row belongs to the first window, so its metrics are logged
                for x in ["mape","wmape","bias","tracking_signal","mae","rmse"]:
                    mlflow.log_metric(x, temp_test[x].iloc[0])

        results_pd['status'] = 'success'
        return results_pd

    except Exception as e:
        # Best effort: a failing granularity must not stop the run, so it is reported as one
        # row carrying the error message in 'status'.
        # The columns are passed as a flat list: a nested list would create a one-level
        # MultiIndex whose tuple labels do not line up with the columns of successful results.
        error_columns = ['ds', 'y', 'yhat','mape','wmape','bias','tracking_signal','mae','rmse']+\
                        list(broadcast_hyper_parameters.value.keys()) + ['status','test_flag','window'] + list(broadcast_granularity.value)
        results_pd = pd.DataFrame(columns = error_columns,index = range(1))
        results_pd[broadcast_granularity.value] = df_data[broadcast_granularity.value].head(1).reset_index(drop = True)
        for x in broadcast_granularity.value:
            results_pd[x] = results_pd[x].astype(str)
        results_pd['status'] = str(e)
        return results_pd

#### Loading the latest Missing_value_treatment file
##### Please update the reading path with the required data path if "Missing value treatment" was not run

In [0]:
# Reading the latest input file based on timestamp
missing_value_dir = app_config['output_dir_path']+"/Data_Processing/Missing_value_treatment"
all_files = os.listdir(missing_value_dir)
missing_op_files = [file for file in all_files if "Missing_value_treatment_results (" in file]
missing_op_files = [file.replace(".csv","") for file in missing_op_files]
version_dates = [datetime.strptime(x.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S') for x in missing_op_files]
max_date = max(version_dates)
max_date = max_date.strftime('%Y-%m-%d-%H-%M-%S')
req_file_name = [x for x in missing_op_files if max_date in x]
missing_op_file_path = os.path.join(missing_value_dir,req_file_name[0]+'.csv')
# print(missing_op_file_path)

# Reading the data
df = pd.read_csv(missing_op_file_path)
# print(df.shape)

# The modeling code expects the date column as 'ds' and the dependent variable as 'y'
df.rename(columns = {ds_config:"ds", dv_config:"y"}, inplace = True)
df['ds'] = pd.to_datetime(df['ds'])

df[modeling_granularity_conf] = df[modeling_granularity_conf].astype(str)

logger.info("Data loaded")
# print(list(broadcast_hyper_parameters.value.keys()))

gbcp = list(modeling_granularity_conf)

# Score every modeling granularity separately.
# Grouping on the granularity columns themselves replaces the earlier key built by
# concatenating their string values without a separator, which mapped different
# granularities (e.g. '1','23' and '12','3') to the same key. sort=False keeps the
# groups in order of first appearance, and the results are concatenated once.
forecasts = [get_forecast_UDF(group) for _, group in df.groupby(gbcp, sort = False)]
df_forecast = pd.concat(forecasts) if forecasts else pd.DataFrame()

df_forecast.to_csv(algo_path+"/Out_of_sample_results_window_level ("+datetime.today().strftime('%Y-%m-%d-%H-%M-%S')+").csv", index = False)
logger.info("Completed Backtesting")

In [0]:
# Reading the latest Out_of_sample_results_window_level file based on timestamp
all_files = [file for file in os.listdir(algo_path)]
backtesting_files = [file for file in all_files if "Out_of_sample_results_window_level (" in file]
backtesting_files = [file.replace(".csv","") for file in backtesting_files]
version_dates = [datetime.strptime(x.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S') for x in backtesting_files]
max_date = max(version_dates)
max_date = max_date.strftime('%Y-%m-%d-%H-%M-%S')
req_file_name = [x for x in backtesting_files if max_date in x]
backtesting_results_file_path = os.path.join(algo_path,req_file_name[0] + ".csv")
print(backtesting_results_file_path)

# Reading the window-level results; only successfully modeled granularities are rolled up.
# .copy() so the column assignments below work on an independent frame.
df = pd.read_csv(backtesting_results_file_path)
df = df[df["status"] == "success"].copy()

df[modeling_granularity_conf] = df[modeling_granularity_conf].astype(str)
df['ds'] = pd.to_datetime(df['ds'])

# Best hyperparameters per modeling granularity
df_hyperparameters = best_hyperparam_results[gbcp + list(hyperparameters_conf)]

# performance metrics: one row per window, averaged over the windows of each granularity
metric_cols = ["mape","wmape","bias","tracking_signal","mae","rmse"]
per_met = ['status',"test_flag","window"] + metric_cols
df_metrics = df[gbcp + per_met].drop_duplicates()
df_metrics1 = df_metrics.groupby(gbcp + ['test_flag','status'])[metric_cols].mean().reset_index()

# Remaining columns (granularity, date, actuals, predictions, ...).
# They are taken in file order rather than from a set, so the exported column order
# is the same on every run. The former "avg(" clean-up and "."<->"dot" renames were
# only needed for Spark column names and have been dropped.
excluded_cols = set(per_met + list(hyperparameters_conf))
rem_cols = [col for col in df.columns if col not in excluded_cols] + ['test_flag']
rem_df = df[rem_cols]

# A date can belong to several overlapping windows: average its values across them
group_cols = gbcp + ['ds','test_flag']
agg_cols = [col for col in rem_cols if col not in group_cols]
exprs = {x: "mean" for x in agg_cols}
rem_df1 = rem_df.groupby(group_cols).agg(exprs).reset_index()

# combining all the data
df_forecast = rem_df1.merge(df_metrics1, on = gbcp + ['test_flag'], how='left')
df_forecast = df_forecast.merge(df_hyperparameters, on = gbcp , how='left')
df_forecast['algorithm'] = 'LSTM'

# exporting the results
df_forecast.to_csv(algo_path+"/Out_of_sample_evaluation_results ("+datetime.today().strftime('%Y-%m-%d-%H-%M-%S')+").csv", index = False)
logger.info("Exported Out of sample evaluation results")

### Predicting future timeperiods
The following code assumes that the X-variables for the required future time periods are available for each modeling granularity

Uncomment the cells below to forecast future periods. Update `df` so that it contains the entire historical data as well as the independent-variable (IDV) values for the required future forecast periods.

In [0]:
# broadcast_test_periods =  broadcast_variable_conf(4) # Provide the no. of timeperiods to forecast in the future

In [0]:
## Reading the latest input file based on timestamp
# all_files = [file for file in os.listdir(app_config['output_dir_path']+"/Data_Processing/Missing_value_treatment")]
# missing_op_files = [file for file in all_files if "Missing_value_treatment_results (" in file]
# missing_op_files = [file.replace(".csv","") for file in missing_op_files]
# version_dates = [datetime.strptime(x.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S') for x in missing_op_files]
# max_date = max(version_dates)
# max_date = max_date.strftime('%Y-%m-%d-%H-%M-%S')
# req_file_name = [x for x in missing_op_files if max_date in x]
# missing_op_file_path = os.path.join(app_config['output_dir_path']+"/Data_Processing/Missing_value_treatment",req_file_name[0]+'.csv')
## print(missing_op_file_path)

## Reading the data
# df = pd.read_csv(missing_op_file_path)
## print(df.shape)

# df.rename(columns = {ds_config:"ds", dv_config:"y"}, inplace = True)
# df['ds'] = pd.to_datetime(df['ds'])
# df[modeling_granularity_conf] = df[modeling_granularity_conf].astype(str)

# # Broadcasting again with the "Future forecast" value since we won't be tracking the future forecast results
# mlflow_tracking_check = broadcast_required_info("Future forecast")
# logger.info("Data which contains the future forecast periods is loaded")

# gbcp = list(modeling_granularity_conf)

In [0]:
# df['gran_tempp'] = df[gbcp].astype(str).sum(axis=1)
# unique_pdts = df['gran_tempp'].unique()
# df_forecast = pd.DataFrame()
# for pdt in unique_pdts:
#     df_forecast = pd.concat([df_forecast,get_forecast_UDF(df[df['gran_tempp']==pdt])])
            
# del(df_forecast['test_flag_agg'])
# df_forecast['algorithm'] = 'LSTM'

In [0]:
# df_forecast.to_csv(algo_path + "/Future_forecast_results ("+datetime.today().strftime('%Y-%m-%d-%H-%M-%S')+").csv",index = False)
# logger.info("Exported future forecast results")

In [0]:
# Write the configuration snapshot to the config folder; the file name carries the
# MLflow experiment id and a millisecond timestamp
_experiment_tag = str(broadcast_tracking.value['mlflow_experiment_id'])
_stamp_ms = datetime.today().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]
config_file_name = "".join(["config_for_exp_id_", _experiment_tag, " (", _stamp_ms, ").yml"])
config_path1 = os.path.join(config_path,config_file_name)
with open(config_path1, 'w') as config_out:
    # keep the key order of the config and use block (non-inline) YAML style
    yaml.dump(temp_config, config_out, default_flow_style=False,sort_keys=False)

In [0]:
# Move from tmp directory to req. location in datalake
import platform
import shutil

plat_sys = platform.system()

if(plat_sys!='Windows'):
    # shutil.move replaces the earlier `mv` shell command: no hand-escaping of the
    # spaces and parentheses in the file name (the escapes used were invalid string
    # escape sequences), it works when the destination path contains spaces, and a
    # failure is reported instead of being silently ignored.
    try:
        shutil.move(os.path.join(p_dir, log_file), logs_path)
    except OSError as err:
        logger.warning("Could not move log file %s to %s: %s", log_file, logs_path, err)