In [1]:
from IPython.display import display, HTML
# Widen the notebook container to 90% of the window so wide tables need less horizontal scrolling.
_WIDE_CONTAINER_CSS = "<style>.container { width:90% !important; }</style>"
display(HTML(_WIDE_CONTAINER_CSS))

In [2]:
import pandas as pd
import numpy as np

from glob import glob
import yaml
import uuid
import os
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

import warnings
warnings.filterwarnings("ignore")

In [3]:
def get_paths(location, models_list, brigade):
    '''
    Locate the result files of every experiment and load the experiment metadata.

    For each model the research-task directories under `/datalake/mhp/{location}/gr/...`
    are globbed, the YAML file next to every `*.csv` result file is read, and only the
    experiments whose `train_features` contain `standard_weight`, `total_weight` and
    `chickens_in_house` are kept.

    Parameters
    ----------
    location : str
        'vpf' searches both `forecast` and `wfv_result` of the given brigade;
        'mpf' searches only `wfv_result` and does not use `brigade` in the path.
    models_list : list of str
        Model names used as directory names (and as forecast directory prefixes).
    brigade : str or None
        Brigade directory name (only used for 'vpf').

    Returns
    -------
    tuple of five lists
        file_paths_to_prediction : one list of forecast `.txt` paths per kept experiment
            (an empty list when no forecast files could be associated with it).
        file_paths_to_info : the result `.csv` path of every kept experiment, same order.
        yaml_file_paths : the YAML path for every CSV that was found (not filtered).
        model_parameters : tuples `(uuid, short_uuid, model_name, train_start_or_duration,
            hyperparameters, features, n_models)`.
        experiment_names : `<first dash-separated part of the CSV name>-<second-to-last part>`,
            one per kept info file.
    '''

    required_features = ('standard_weight', 'total_weight', 'chickens_in_house')

    paths_to_predictions = []
    paths_to_info = []

    for model in models_list:
        if location == 'vpf':
            paths_to_predictions += glob(f'/datalake/mhp/{location}/gr/{brigade}/ecf/forecast/{model}/research_task_*/{model}_*')
            paths_to_info += glob(f'/datalake/mhp/{location}/gr/{brigade}/ecf/wfv_result/{model}/research_task_*')
        elif location == 'mpf':
            # Fixed: this branch used to extend an undefined name `paths` (NameError).
            paths_to_info.extend(glob(f'/datalake/mhp/{location}/gr/ecf/wfv_result/{model}/research_task_*'))

    # Every CSV inside the research-task directories is one experiment result.
    file_paths_to_info = []
    for info_dir in paths_to_info:
        file_paths_to_info.extend(glob(os.path.join(info_dir, '*.csv')))

    # One list of day-ahead forecast files per forecast directory; directories without files are skipped.
    file_paths_to_prediction = []
    for prediction_dir in paths_to_predictions:
        prediction_paths = glob(os.path.join(prediction_dir, f'*_{location}_d2_*.txt'))
        if prediction_paths:
            file_paths_to_prediction.append(prediction_paths)

    yaml_file_paths = [f.replace('.csv', '.yaml') for f in file_paths_to_info]

    uuids = []
    model_names = []
    train_start_or_duration = []
    hyperparameters = []
    features = []

    for yaml_path in yaml_file_paths:
        with open(yaml_path, 'r') as f:
            res = yaml.safe_load(f)
        if not all(feature in res['train_features'] for feature in required_features):
            continue
        uuids.append(res['unique_uuid'])
        model_names.append(res['model_name'])
        hyperparameters.append(res['model_hyperparameters'])
        features.append(res['train_features'])
        # Experiments are configured either with a history length or with a fixed start date.
        if 'duration_training_history' in res:
            train_start_or_duration.append(res['duration_training_history'])
        else:
            train_start_or_duration.append(res['train_start'])

    info_files = []
    forecast_files = []
    for exp_uuid, exp_model in zip(uuids, model_names):
        exp = f"{exp_model}_{exp_uuid}"
        for path_idx, info_path in enumerate(file_paths_to_info):
            if exp not in info_path:
                continue
            # Prefer the forecast group whose paths contain the experiment id. The old code
            # indexed the forecast list with the position of the info file, although the two
            # lists are filtered independently (wrong pairing or IndexError).
            # NOTE(review): assumes forecast directories are named `<model>_<uuid>` - confirm.
            by_name = [group for group in file_paths_to_prediction
                       if any(exp in prediction_path for prediction_path in group)]
            if by_name:
                forecast_group = by_name[0]
            elif path_idx < len(file_paths_to_prediction):
                # Legacy positional pairing, kept as a fallback when names do not match.
                forecast_group = file_paths_to_prediction[path_idx]
            else:
                forecast_group = []
            info_files.append(info_path)
            forecast_files.append(forecast_group)

    file_paths_to_info = info_files
    file_paths_to_prediction = forecast_files

    n_models = []
    for info_file in file_paths_to_info:
        # Only the header is needed to tell hourly from daily experiments.
        header = pd.read_csv(info_file, nrows=0).columns
        if f'd2_abs_error_hour_00' in header:
            n_models.append(f"24 (hourly)")
        else:
            n_models.append(f"1 (daily)")

    shorten_uuids = ["-".join([n.split('-')[0], n.split('-')[-2]]) for n in uuids]

    model_parameters = list(zip(uuids, shorten_uuids, model_names, train_start_or_duration, hyperparameters, features, n_models))

    experiment_names = []
    for info_path in file_paths_to_info:
        # Parse the file name only, accepting both '/' and '\\' separators; the old code
        # required a backslash inside the path and therefore only worked on Windows.
        file_name = info_path.replace('\\', '/').rsplit('/', 1)[-1]
        name_parts = file_name.split('-')
        experiment_names.append("-".join([name_parts[-5], name_parts[-2]]))

    return file_paths_to_prediction, file_paths_to_info, yaml_file_paths, model_parameters, experiment_names

In [4]:
def calculate_pred_cons(location, brigade, file_paths_to_prediction, experiment_names,
                        date_from='2022-01-01', date_to='2022-10-19'):
    
    '''
    Build an hourly frame with the actual consumption of a brigade and one column of
    predicted consumption per experiment.

    Parameters
    ----------
    location, brigade : str
        Used to build the path of `hour_consumption_{brigade}.csv`.
    file_paths_to_prediction : list of lists of str
        Forecast files of each experiment; the day is taken from the second-to-last
        `_`-separated token of the file name, the 24 hourly values from the first line
        (`:`-separated, first two and last token skipped).
    experiment_names : list of str
        Column name for each entry of `file_paths_to_prediction` (same order).
    date_from, date_to : str, optional
        Inclusive bounds of the returned period (defaults keep the previous fixed window).

    Returns
    -------
    (DataFrame, int)
        Rows that have the actual value and a prediction of every experiment,
        restricted to the requested period, and the number of such rows.

    Raises
    ------
    ValueError
        If a forecast file holds fewer than 24 hourly values.
    '''
    
    path_to_consumption = f'/datalake/mhp/{location}/gr/{brigade}/ecf/processed_data/hour_consumption_{brigade}.csv'

    # Parse the index explicitly instead of using the `date_parser` argument of read_csv,
    # which newer pandas versions deprecate.
    consumption = pd.read_csv(path_to_consumption, index_col='date_time')
    consumption.index = pd.to_datetime(consumption.index, format='%Y-%m-%d %H:%M:%S')
    consumption.columns = ['hour_consumption']

    df_preds = consumption.copy()

    # Collect {timestamp: value} per experiment first and attach each column once;
    # writing single cells with .loc enlarged the frame on every assignment.
    hourly_by_experiment = {}
    for num_exp, exp_paths in enumerate(file_paths_to_prediction):
        exp_name = experiment_names[num_exp]
        for day_pred in exp_paths:
            file_name = day_pred.replace('\\', '/').rsplit('/', 1)[-1]
            day_date = file_name.split('_')[-2]

            with open(day_pred, 'r') as file:
                first_line = file.readline()
            predictions_by_day = [int(k) for k in first_line.split(':')[2:-1]]
            if len(predictions_by_day) < 24:
                raise ValueError(f"{day_pred}: expected 24 hourly values, found {len(predictions_by_day)}")

            day_start = pd.to_datetime(day_date)
            hourly = hourly_by_experiment.setdefault(exp_name, {})
            for h in range(24):
                # A later file for the same hour overrides an earlier one, as before.
                hourly[day_start + timedelta(hours=h)] = predictions_by_day[h]

    for exp_name, hourly in hourly_by_experiment.items():
        # Index alignment ignores timestamps without actual consumption; such rows
        # were previously appended and then removed again by dropna().
        df_preds[f'{exp_name}'] = pd.Series(hourly, dtype=float)

    # Keep only hours with the actual value and a prediction from every experiment.
    df_preds = df_preds.dropna()
    df_preds = df_preds.loc[date_from:date_to]

    return df_preds, df_preds.shape[0]

In [5]:
def form_pred_consumption_df(df_preds, len_df_pred, exp_name):
    '''
    Reshape the hourly predictions of one experiment into a day-by-hour table.

    The first `len_df_pred` rows of `df_preds` are walked in steps of 24. The timestamp
    found at each step is treated as the start of a day; the 24 values are looked up by
    label at `start + h hours`. A missing timestamp (or a missing `exp_name` column)
    contributes 0. Days whose 24 values are all 0 are removed from the result.

    NOTE: the stepping is positional, so the rows are expected to come in complete
    24-hour blocks that start at midnight.

    Returns a DataFrame indexed by `datetime.date` with columns
    `prediction_00` ... `prediction_23`.
    '''

    hour_columns = [f"prediction_{h:02d}" for h in range(24)]

    # Collect whole rows and build the frame once; assigning single cells to a growing
    # DataFrame copies it on every enlargement.
    rows_by_day = {}
    for i in range(0, len_df_pred, 24):
        day_start = df_preds.index[i]
        day_values = []
        for h in range(24):
            try:
                day_values.append(df_preds.loc[day_start + timedelta(hours=h), f'{exp_name}'])
            except KeyError:
                day_values.append(0)
        rows_by_day[day_start.date()] = day_values

    if not rows_by_day:
        return pd.DataFrame()

    df_pred_consumption = pd.DataFrame.from_dict(rows_by_day, orient='index', columns=hour_columns)

    # A day consisting only of zeros carries no prediction at all - drop it.
    all_zero_days = (df_pred_consumption == 0).all(axis=1)
    return df_pred_consumption.loc[~all_zero_days]

In [6]:
def calculate_real_cons_df(location, brigade, df_preds, len_df_pred):
    
    '''
    Reshape the actual hourly consumption into a day-by-hour table
    (days in the index, hours in the columns).

    The first `len_df_pred` rows of `df_preds` are walked in steps of 24. The timestamp
    found at each step is treated as the start of a day and `hour_consumption` is looked
    up by label at `start + h hours`; a missing timestamp contributes 0.

    `location` and `brigade` are accepted for call compatibility and are not used.

    NOTE: the stepping is positional, so the rows are expected to come in complete
    24-hour blocks that start at midnight.

    Returns a DataFrame indexed by `datetime.date` with columns
    `real_consumption_00` ... `real_consumption_23`.
    '''

    hour_columns = [f"real_consumption_{h:02d}" for h in range(24)]

    # Collect whole rows and build the frame once; assigning single cells to a growing
    # DataFrame copies it on every enlargement.
    rows_by_day = {}
    for i in range(0, len_df_pred, 24):
        day_start = df_preds.index[i]
        day_values = []
        for h in range(24):
            try:
                day_values.append(df_preds.loc[day_start + timedelta(hours=h), 'hour_consumption'])
            except KeyError:
                day_values.append(0)
        rows_by_day[day_start.date()] = day_values

    if not rows_by_day:
        return pd.DataFrame()

    return pd.DataFrame.from_dict(rows_by_day, orient='index', columns=hour_columns)

In [7]:
def get_errors_dataframe(exp_num, df_pred_consumption, df_real_consumption, df_pred_median, df_pred_mean, df_pred_q25, df_pred_q75, file_paths_to_info, model_parameters, exp_name):
    
    '''
    Add the error statistics of one experiment to the four summary frames.

    Absolute and relative (percent) errors are computed per day and hour. For every hour
    the median / mean / 25% / 75% quantile over the days is taken, and the average of the
    24 hourly statistics is stored in row `exp_num` of the matching summary frame
    (columns `idx`, `d2_mean_abs_error`, `d2_mean_relative_error`).

    Parameters
    ----------
    exp_num : row label to write in each summary frame.
    df_pred_consumption, df_real_consumption : day-by-hour tables with 24 columns.
        Frames with the same number of rows are compared row by row; otherwise only
        the days present in both indexes are compared.
    df_pred_median, df_pred_mean, df_pred_q25, df_pred_q75 : summary frames, modified in place.
    file_paths_to_info, model_parameters : accepted for call compatibility, not used.
    exp_name : value stored in the `idx` column.

    Returns
    -------
    (df_pred_mean, df_pred_median, df_pred_q25, df_pred_q75) - note that the mean frame
    comes first here although the median frame comes first in the parameters.
    '''    
    
    if df_pred_consumption.shape[0] != df_real_consumption.shape[0]:
        # The predicted table can have lost days (all-zero days are dropped upstream).
        # Positional arithmetic would then broadcast or fail, so match the days by label.
        shared_days = df_pred_consumption.index.intersection(df_real_consumption.index)
        df_pred_consumption = df_pred_consumption.loc[shared_days]
        df_real_consumption = df_real_consumption.loc[shared_days]

    pred = df_pred_consumption.to_numpy(dtype=float)
    orig = df_real_consumption.to_numpy(dtype=float)

    abs_errors = np.abs(orig - pred)
    with np.errstate(divide='ignore', invalid='ignore'):
        relative_errors = np.abs(pred / orig - 1) * 100
    # The relative error is undefined when the actual value is 0. Treat x/0 like 0/0
    # (NaN, skipped by the statistics) instead of letting inf dominate every mean.
    relative_errors[~np.isfinite(relative_errors)] = np.nan

    hour_columns = [f'hour_{h:02d}' for h in range(24)]
    abs_err_df = pd.DataFrame(abs_errors, columns=hour_columns)
    relative_err_df = pd.DataFrame(relative_errors, columns=hour_columns)

    summaries = (
        (df_pred_median, lambda errors: errors.median()),
        (df_pred_mean, lambda errors: errors.mean()),
        (df_pred_q25, lambda errors: errors.quantile(0.25)),
        (df_pred_q75, lambda errors: errors.quantile(0.75)),
    )
    for summary, per_hour_statistic in summaries:
        summary.loc[exp_num, 'idx'] = exp_name
        # Series.mean() skips hours for which the statistic is undefined.
        summary.loc[exp_num, f'd2_mean_abs_error'] = per_hour_statistic(abs_err_df).mean()
        summary.loc[exp_num, f'd2_mean_relative_error'] = per_hour_statistic(relative_err_df).mean()
    
    return df_pred_mean, df_pred_median, df_pred_q25, df_pred_q75

In [13]:
def final_df_vpf_best_exp_by_day(location, brigade, models_list, df_pred_mean, df_pred_median, df_pred_q25, df_pred_q75, model_parameters, sum_brigades_statistics_uuid, len_df_pred=None):
    
    '''
    Build the table of the best experiment of every model for one brigade and persist it.

    For each model the experiments whose name contains the model name are taken from
    `df_pred_median`; the one with the smallest `d2_mean_relative_error` is the best.
    Its mean / median / quantile errors and its metadata from `model_parameters`
    (matched on the short uuid, i.e. the part of the experiment name after the last `_`)
    form one row of the result. Models without any experiment are skipped.

    Side effects
    ------------
    * appends one `<brigade>-<model>: <model>_<uuid>` line per row to
      `best_models_by_brigade_<sum_brigades_statistics_uuid>.txt` under
      `/datalake/mhp/{location}/gr/sum_brigades/statistical_result`;
    * writes the table to `(best_model_by_day)_statistical_result_..._<YYYYMMDD>.csv`
      in the `statistical_result` directory of the brigade.

    Parameters
    ----------
    len_df_pred : int, optional
        Number used in the `mean_*_value_<n>` column names. When omitted, the
        notebook-level variable `len_df_pred` is used, as older calls relied on it.
        NOTE(review): the companion "all experiments" table uses `len_df_pred // 24`
        in the same column names - confirm which of the two is intended.

    Returns
    -------
    DataFrame indexed by `<model>-<short uuid>`.
    '''

    if len_df_pred is None:
        try:
            len_df_pred = globals()['len_df_pred']
        except KeyError:
            raise NameError("name 'len_df_pred' is not defined") from None

    error_columns = ['d2_mean_abs_error', 'd2_mean_relative_error']
    result_columns = [
        'idx',
        f'mean_abs_value_{len_df_pred}', f'mean_relative_value_{len_df_pred}',
        'median_abs_value', 'median_relative_value',
        'q25_abs_value', 'q25_relative_value',
        'q75_abs_value', 'q75_relative_value',
        'exp_model', 'n_models_per_day', 'train_start/duration',
        'model_hyperparameters', 'train_features', 'exp_uuid',
    ]

    records = []
    for model in models_list:
        experiment_names = [name for name in df_pred_median.index if model in str(name)]
        if not experiment_names:
            # Nothing to rank for this model (previously an IndexError).
            continue

        median_relative = df_pred_median.loc[experiment_names, 'd2_mean_relative_error']
        if median_relative.isna().all():
            continue
        # First experiment with the smallest median relative error.
        best_model_experimental = median_relative.idxmin()

        best_mean = df_pred_mean.loc[best_model_experimental, error_columns]
        best_median = df_pred_median.loc[best_model_experimental, error_columns]
        best_q25 = df_pred_q25.loc[best_model_experimental, error_columns]
        best_q75 = df_pred_q75.loc[best_model_experimental, error_columns]

        shorten_best_model_uuid = best_model_experimental.split('_')[-1]

        # Reset for every model so that metadata of a previous model cannot leak
        # into this row when no parameters match.
        matched_parameters = None
        for parameters in model_parameters:
            if shorten_best_model_uuid == parameters[1]:
                matched_parameters = parameters
        if matched_parameters is None:
            exp_uuid = exp_model = train_start_or_duration = None
            model_hyperparameters = train_features = n_models_per_day = None
        else:
            exp_uuid = matched_parameters[0]
            exp_model = matched_parameters[2]
            train_start_or_duration = matched_parameters[3]
            model_hyperparameters = matched_parameters[-3]
            train_features = matched_parameters[-2]
            n_models_per_day = matched_parameters[-1]

        records.append({
            'idx': f"{model}-{shorten_best_model_uuid}",
            f'mean_abs_value_{len_df_pred}': np.around(best_mean['d2_mean_abs_error'], 4),
            f'mean_relative_value_{len_df_pred}': np.around(best_mean['d2_mean_relative_error'], 4),
            'median_abs_value': np.around(best_median['d2_mean_abs_error'], 4),
            'median_relative_value': np.around(best_median['d2_mean_relative_error'], 4),
            'q25_abs_value': np.around(best_q25['d2_mean_abs_error'], 4),
            'q25_relative_value': np.around(best_q25['d2_mean_relative_error'], 4),
            'q75_abs_value': np.around(best_q75['d2_mean_abs_error'], 4),
            'q75_relative_value': np.around(best_q75['d2_mean_relative_error'], 4),
            'exp_model': exp_model,
            'n_models_per_day': n_models_per_day,
            'train_start/duration': train_start_or_duration,
            'model_hyperparameters': f"{model_hyperparameters}",
            'train_features': f"{train_features}",
            'exp_uuid': exp_uuid,
        })

    df_final_best_vpf_by_day = pd.DataFrame(records, columns=result_columns).set_index('idx')

    with open(os.path.join(f'/datalake/mhp/{location}/gr/sum_brigades/statistical_result', 
                           f'best_models_by_brigade_{sum_brigades_statistics_uuid}.txt'), 'a') as file:
        for _, md in df_final_best_vpf_by_day.iterrows():
            file.write(f"{brigade}-{md['exp_model']}: {md['exp_model']}_{md['exp_uuid']}\n")

    df_final_best_vpf_by_day.to_csv(os.path.join(f'/datalake/mhp/{location}/gr/{brigade}/ecf/statistical_result', 
                                    f'(best_model_by_day)_statistical_result_{location}_{brigade}_{datetime.now().strftime("%Y%m%d")}.csv'))

    return df_final_best_vpf_by_day

In [14]:
def final_df_vpf_all_exp_by_day(location, brigade, df_pred_mean, df_pred_median, df_pred_q25, df_pred_q75, model_parameters, len_df_pred=None):
    
    '''
    Build the table with the error statistics of all experiments of one brigade and
    write it to `(all_models)_statistical_result_{location}_{brigade}_<YYYYMMDD>.csv`
    in the `statistical_result` directory of the brigade.

    Every non-null label of `df_pred_mean` is one experiment. Its metadata is taken from
    `model_parameters` by the short uuid (the part of the label after the last `_`);
    experiments without matching parameters get empty metadata instead of breaking
    the table construction.

    Parameters
    ----------
    len_df_pred : int, optional
        Number of hourly rows; `len_df_pred // 24` is used in the `mean_*_value_<n>`
        column names. When omitted, the notebook-level variable `len_df_pred` is used,
        as older calls relied on it.

    Returns
    -------
    DataFrame indexed by experiment name.
    '''

    if len_df_pred is None:
        try:
            len_df_pred = globals()['len_df_pred']
        except KeyError:
            raise NameError("name 'len_df_pred' is not defined") from None
    n_days = len_df_pred // 24

    # Rows with a null label carry no experiment name and are ignored.
    models = df_pred_mean['d2_mean_abs_error'].index.dropna()

    # Exactly one metadata entry per experiment keeps the metadata columns as long as
    # the table; appending every match made them shorter or longer than the index.
    parameters_by_short_uuid = {}
    for parameters in model_parameters:
        parameters_by_short_uuid.setdefault(parameters[1], parameters)
    matched = [parameters_by_short_uuid.get(name.split('_')[-1]) for name in models]

    def metadata(position):
        return [None if parameters is None else parameters[position] for parameters in matched]

    def rounded(frame, column):
        return np.around(frame.loc[models, column].dropna(), 4)

    df_final_vpf_all = pd.DataFrame({
        'idx': np.array(models),
        f'mean_abs_value_{n_days}': rounded(df_pred_mean, 'd2_mean_abs_error'),
        f'mean_relative_value_{n_days}': rounded(df_pred_mean, 'd2_mean_relative_error'),

        'median_abs_value': rounded(df_pred_median, 'd2_mean_abs_error'),
        'median_relative_value': rounded(df_pred_median, 'd2_mean_relative_error'),

        'q25_abs_value': rounded(df_pred_q25, 'd2_mean_abs_error'),
        'q25_relative_value': rounded(df_pred_q25, 'd2_mean_relative_error'),

        'q75_abs_value': rounded(df_pred_q75, 'd2_mean_abs_error'),
        'q75_relative_value': rounded(df_pred_q75, 'd2_mean_relative_error'),

        'exp_model': metadata(2),
        'n_models_per_day': metadata(-1),
        'model_hyperparameters': metadata(-3),
        'train_start/duration': metadata(3),
        'train_features': metadata(-2),
        'exp_uuid': metadata(0),
    })

    df_final_vpf_all = df_final_vpf_all.set_index('idx')
    # The target directory is absolute, so no working-directory prefix is needed.
    df_final_vpf_all.to_csv(os.path.join(f'/datalake/mhp/{location}/gr/{brigade}/ecf/statistical_result', 
            f'(all_models)_statistical_result_{location}_{brigade}_{datetime.now().strftime("%Y%m%d")}.csv'))

    return df_final_vpf_all

In [1]:
# --- Run configuration -------------------------------------------------------
location = 'vpf'
brigades_list = ['brigade_13', 'brigade_14', 'brigade_49']
# Full list of brigades:
# brigades_list = ['brigade_1', 'brigade_2', 'brigade_3', 'brigade_4', 'brigade_5', 'brigade_6', 
#                 'brigade_7', 'brigade_8', 'brigade_9', 'brigade_10', 'brigade_11', 'brigade_12', 
#                 'brigade_13', 'brigade_14', 'brigade_22', 'brigade_42', 'brigade_43', 'brigade_47', 'brigade_49']
models_list = ['random_forest', 'xgboost']
# One id per run: every brigade appends its best models to the same best_models_by_brigade_<id>.txt file.
sum_brigades_statistics_uuid = str(uuid.uuid1())


if location == 'vpf':
    for brigade in brigades_list:
        print(brigade)
        file_paths_to_prediction, file_paths_to_info, yaml_file_paths, model_parameters, experiment_names = get_paths(location, models_list,  brigade)

        if not experiment_names:
            # Without experiments the summary frames stay empty and set_index('idx') would fail.
            print(f'{brigade}: no experiments found, skipped')
            continue

        # Start every brigade with empty summary frames. They used to be created once
        # before the loop and re-indexed inside it, so from the second brigade on the
        # rows of earlier brigades stayed in the frames with a NaN label.
        df_pred_median = pd.DataFrame()
        df_pred_mean = pd.DataFrame()
        df_pred_q25 = pd.DataFrame()
        df_pred_q75 = pd.DataFrame()

        # `len_df_pred` is also read as a notebook-level variable by the final_df_* functions.
        df_preds, len_df_pred = calculate_pred_cons(location, brigade, file_paths_to_prediction, experiment_names)
        df_real_consumption = calculate_real_cons_df(location, brigade, df_preds, len_df_pred)
        for exp_num, exp_name in enumerate(experiment_names):
            df_pred_consumption = form_pred_consumption_df(df_preds, len_df_pred, exp_name)
            df_pred_mean, df_pred_median, df_pred_q25, df_pred_q75 = get_errors_dataframe(exp_num, df_pred_consumption, df_real_consumption, df_pred_median, df_pred_mean, df_pred_q25, df_pred_q75, file_paths_to_info, model_parameters, exp_name)

        df_pred_median = df_pred_median.set_index('idx')
        df_pred_mean = df_pred_mean.set_index('idx')
        df_pred_q25 = df_pred_q25.set_index('idx')
        df_pred_q75 = df_pred_q75.set_index('idx')

        df_final_best_vpf_by_day = final_df_vpf_best_exp_by_day(location, brigade, models_list, df_pred_mean, df_pred_median, df_pred_q25, df_pred_q75, model_parameters, sum_brigades_statistics_uuid)
        df_final_vpf_all = final_df_vpf_all_exp_by_day(location, brigade, df_pred_mean, df_pred_median, df_pred_q25, df_pred_q75, model_parameters)


elif location == 'mpf':
    # get_paths returns five values; the result-file paths are the second one and the
    # model parameters the fourth (unpacking into two names raised ValueError).
    _, file_paths, _, model_parameters, _ = get_paths(location, models_list, brigade=None)
    # NOTE(review): get_dataframes, final_df_mpf and the df_orig_* / len_df_orig variables
    # are not defined in the visible cells - confirm they exist before running this branch.
    df_pred_mean, df_pred_median, df_pred_q25, df_pred_q75, len_df_pred = get_dataframes(file_paths)

    df_final = final_df_mpf(location, df_pred_mean, df_pred_median, df_pred_q75, df_orig_mean, df_orig_median, df_orig_q75, len_df_pred, len_df_orig, model_parameters)