In [0]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
import glob
import yaml

In [0]:
# Execute the shared configuration notebook before anything else in this notebook runs.
# NOTE(review): later cells read `app_config` without defining it, so it is presumably
# created by 0_Config.ipynb — confirm.
%run ./0_Config.ipynb

In [0]:
# Resolve the folder that receives the consolidated results and create it on first use.
# `output_directory` is the parent folder that also holds one sub-folder per algorithm.
output_directory = os.path.join(app_config['output_dir_path'], "Modeling_Results")
root_dir = "Consolidated_Results/"
final_path = os.path.join(output_directory, root_dir)

consolidated_dir_missing = not os.path.exists(final_path)
if consolidated_dir_missing:
    os.makedirs(final_path)
    # The path is echoed only when the folder had to be created
    print(final_path)

In [0]:
# Names of every forecaster whose results folder is looked up during consolidation.
# Each entry is used verbatim as a sub-folder name under the modeling results directory.
algorithms = [
    "ElasticNet",
    "ExponentialSmoothingHolt",
    "ExponentialSmoothingHoltWinters",
    "Lasso_cvglmnet",
    "Prophet",
    "SARIMAX",
    "SimpleExponentialSmoothing",
    "XGBoost",
    "DeepAR",
    "DeepState",
    "LSTM",
]

In [0]:
# Consolidate, for every algorithm, the successful rows of its most recent
# "Out_of_sample_evaluation_results (<timestamp>).csv" file into `out_results_fin`.
_result_marker = "Out_of_sample_evaluation_results ("
_version_fmt = '%Y-%m-%d-%H-%M-%S'

# One frame per algorithm; concatenated once after the loop instead of growing a
# DataFrame inside it (which re-copies all accumulated rows on every iteration).
algo_frames = []

for algo in algorithms:
    algo_path = os.path.join(output_directory, algo)

    # os.listdir below needs an existing directory; a glob match does not guarantee that
    if not os.path.isdir(algo_path):
        print(algo_path+" >>> does not exists")
        continue

    out_files = [file.replace(".csv", "") for file in os.listdir(algo_path) if _result_marker in file]
    if not out_files:
        print("No Out_of_sample_evaluation_results for "+algo)
        continue

    # The text between the parentheses is the run timestamp; keep the newest file
    latest_file = max(
        out_files,
        key=lambda name: datetime.strptime(name.split('(')[1].replace(')', ''), _version_fmt),
    )
    out_results_file_path = os.path.join(algo_path, latest_file + ".csv")

    out_results = pd.read_csv(out_results_file_path)
    out_results = out_results[out_results["status"] == "success"]
    # Renaming "Intercept" and "expected_value" as base
    out_results = out_results.rename(columns = {'Intercept':'base', 'expected_value':'base'})
    print(algo,out_results.shape)
    algo_frames.append(out_results)

# Without any results the date column below cannot exist; fail with a clear message
# instead of an unexplained KeyError.
if not algo_frames:
    raise ValueError("No out-of-sample evaluation results were found for any algorithm under " + output_directory)

out_results_fin = pd.concat(algo_frames, ignore_index = True)

# Renaming ds to the column name for the date that is provided in the config
date_col = app_config['date_var']
out_results_fin = out_results_fin.rename(columns = {'ds':date_col})
out_results_fin[date_col] = pd.to_datetime(out_results_fin[date_col])

# The raw coefficient column is not carried into the consolidated output
if 'Coeffs' in out_results_fin.columns:
    out_results_fin = out_results_fin.drop(columns = 'Coeffs')
# display(out_results_fin)

In [0]:
# Reading the latest input file based on the timestamp embedded in its name:
# "Missing_value_treatment_results (<YYYY-mm-dd-HH-MM-SS>).csv"
missing_value_dir = app_config['output_dir_path']+"/Data_Processing/Missing_value_treatment"
missing_op_files = [file.replace(".csv","") for file in os.listdir(missing_value_dir)
                    if "Missing_value_treatment_results (" in file]

# Fail with an explicit message rather than "max() arg is an empty sequence"
if not missing_op_files:
    raise FileNotFoundError("No 'Missing_value_treatment_results (<timestamp>).csv' file found in " + missing_value_dir)

# The text between the parentheses is the run timestamp; keep the newest file
latest_missing_op_file = max(
    missing_op_files,
    key=lambda name: datetime.strptime(name.split('(')[1].replace(')',''), '%Y-%m-%d-%H-%M-%S'),
)
missing_op_file_path = os.path.join(missing_value_dir, latest_missing_op_file + ".csv")
# print(missing_op_file_path)

# Reading the data
raw_data = pd.read_csv(missing_op_file_path)
print("Overall Count",raw_data.shape)

# Collect every exogenous (independent) variable configured for any algorithm.
algo_settings = app_config['Algorithms']
deep_learning_algos = ['DeepAR','DeepState']

temp_idvs = []
all_algos = [algo for algo in algo_settings.keys() if algo not in deep_learning_algos]
for algo in all_algos:
    if "exogenous_variables" in algo_settings[algo]:
        exog = algo_settings[algo]['exogenous_variables']
        temp_idvs = temp_idvs + (exog['positive_corr'] + exog['negative_corr'] + exog['uncertain_corr'])
for algo in deep_learning_algos:
    # Same guard as the loop above: these algorithms (or their exogenous settings)
    # may be absent from the config, which previously raised a KeyError here.
    if algo in algo_settings and "exogenous_variables" in algo_settings[algo]:
        temp_idvs = temp_idvs + (algo_settings[algo]['exogenous_variables']['feat_dynamic_real'])

idvs = list(set(temp_idvs))

# Non-key, non-IDV columns present in both frames; they would collide in the merge
key_cols = app_config['modeling_granularity'] + [app_config['date_var']]
common_cols = list(set(raw_data.columns).intersection(out_results_fin.columns) - set(key_cols + idvs))

# Columns that already end in "_x"/"_X" and are not IDVs themselves
_xCols_raw = list({x for x in raw_data.columns if x.endswith(("_x", "_X"))} - set(idvs))
_xCols_conso = list({x for x in out_results_fin.columns if x.endswith(("_x", "_X"))} - set(idvs))

# Each frame is renamed with ONE mapping, which pandas applies to the original labels.
# Renaming step by step let the steps interfere: an IDV "foo" renamed to "foo_x" was
# renamed again (together with a pre-existing "foo_x" column) to "foox", and shared
# columns ending in "_x" never received their "_raw" suffix.
raw_renames = {}
# "_x"/"_X" becomes "x"/"X" so these columns cannot clash with the "_x" suffix given to IDVs
for var in _xCols_raw:
    raw_renames[var] = var.replace("_x","x").replace("_X","X")
# Independent variables get the suffix "_x" in the raw data so they can be told apart
for var in idvs:
    raw_renames[var] = var+"_x"
# Columns shared with the model results get the suffix "_raw" (on top of any "_x" clean-up)
for var in common_cols:
    raw_renames[var] = raw_renames.get(var, var)+"_raw"
raw_data = raw_data.rename(columns = raw_renames)

conso_renames = {}
for var in _xCols_conso:
    conso_renames[var] = var.replace("_x","x").replace("_X","X")
# Independent variables get the suffix "_contribution" in the model results
for var in idvs:
    conso_renames[var] = var+"_contribution"
out_results_fin = out_results_fin.rename(columns = conso_renames)

In [0]:
print("Overall Count of Model Results",out_results_fin.shape)

# Keys on which the model results and the treated input data are joined
date_col = app_config['date_var']
merge_keys = app_config['modeling_granularity'] + [date_col]

# The raw data's date column must be a timestamp to line up with the consolidated results
raw_data[date_col] = pd.to_datetime(raw_data[date_col])

# Left join: every model-result row is kept and enriched with the raw-data columns
overall_df = pd.merge(out_results_fin, raw_data, how = "left", on = merge_keys)
print("Overall Count after merging",overall_df.shape)

# Tag every row with the experiment id taken from the tracking section of the config
overall_df['experiment_id'] = app_config['tracking']['mlflow_experiment_id']

In [0]:
# Write the merged frame to a CSV whose name carries the current timestamp
export_stamp = datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
export_path = final_path+"/Consolidated_results ("+export_stamp+").csv"
overall_df.to_csv(export_path, index = False)