In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import re

import os
from glob import glob
from tqdm import tqdm

import yaml
from yaml import dump
import uuid
import itertools

In [3]:
import warnings
warnings.simplefilter(action="ignore")

In [4]:
def get_paths(models_list):
    '''
    Finds all the paths to forecasts and experiments metadata (directories /forecast/ and /wf_result/)
    
    Returns list with paths to forecast files, dict with metadata and list of all the experiment names
    '''
    
    uuids = []
    model_names = []
    train_start_or_duration = []
    hyperparameters = []
    features = []
    n_models = []
    
    paths_to_predictions = []
    paths_to_info = []

    forecast_paths = []
    metadata_paths = []


    for model in models_list:
        paths_to_predictions += glob(f'/masters_diploma/forecast/{model}/research_task_*/{model}_*/')
        paths_to_info += glob(f'/masters_diploma/wf_result/{model}/research_task_*')

    print(len(paths_to_predictions))
    print(len(paths_to_info))
            
    for path2 in paths_to_info:   
#     for path2 in [max(paths_to_info, key=os.path.getctime)]:   # тільки для останнього експерименту
        metadata_paths.extend(glob(os.path.join(path2, '*.csv')))


    for path2 in paths_to_predictions:
#     for path2 in [max(paths_to_predictions, key=os.path.getctime)]:   # тільки для останнього експерименту
        prediction_paths = glob(os.path.join(path2, f'*.csv'))
        if len(prediction_paths) > 0:
            forecast_paths.append(prediction_paths)

    yaml_file_paths = [f.replace('.csv', '.yaml') for f in metadata_paths]

    metadata = {}
    experiment_names = []
    for file in yaml_file_paths: 
        with open(file, 'r') as f:
            res = yaml.safe_load(f)

        shorten_uuid = "-".join([res['unique_uuid'].split('-')[0], res['unique_uuid'].split('-')[-2]])
        dur = res['duration_training_history'] if 'duration_training_history' in res else res['train_start']

        metadata[shorten_uuid] = {
            "uuid": res['unique_uuid'],
            "model_name": res['model_name'],
            "duration/train_start": dur,
            "hyperparameters": res['model_hyperparameters'],
            "features": res['train_features']
        }

        experiment_names.append(f"{res['model_name']}_{shorten_uuid}")


    # for file in yaml_file_paths: 
    #     with open(file, 'r') as f:
    #         res = yaml.safe_load(f)
    #     uuids.append(res['unique_uuid'])
    #     model_names.append(res['model_name'])
    #     hyperparameters.append(res['model_hyperparameters'])
    #     features.append(res['train_features'])
    #     if 'duration_training_history' in res:
    #         train_start_or_duration.append(res['duration_training_history'])
    #     else:
    #         train_start_or_duration.append(res['train_start'])
     
    # shorten_uuids = ["-".join([n.split('-')[0], n.split('-')[-2]]) for n in uuids]
    # metadata_lst = list(zip(uuids, shorten_uuids, model_names, train_start_or_duration, hyperparameters, features, n_models))           
    
    # file_paths_splited = [metadata_paths[k].split('-') for k in range(len(metadata_paths))]
    # model_name = [file_paths_splited[k][-5].split('\\')[1] for k in range(len(metadata_paths))]
    # experiment_names = ["-".join([model_name[k], file_paths_splited[k][-2]]) for k in range(len(metadata_paths))]     
    

    return forecast_paths, metadata_paths, metadata, experiment_names

In [5]:
def facts(path_to_all):

    dateparse = lambda dates: datetime.strptime(dates, '%Y-%m-%d %H:%M:%S')
    path_to_weather = f'{path_to_all}/processed_data/history_weather.csv'

    fact_temperature = pd.read_csv(
        path_to_weather,
        parse_dates=['date'],
        index_col='date', 
        date_parser=dateparse
    )[['temperature']]
    
    fact_temperature.index.name = 'date_time'

    return fact_temperature

In [6]:
def make_forecasts_df(fact_pred, paths_to_exp_forecasts, exp_name):

    '''
    Creating a dataframe of forecasted temperature values
    '''

    dateparse = lambda dates: datetime.strptime(dates, '%Y-%m-%d %H:%M:%S')
    
    df = fact_pred.copy()
    
    for num_exp, day_pred in enumerate(paths_to_exp_forecasts):
        d = day_pred.split('_')[-4]
        day_date = day_pred.split('\\')[-1].split('_')[-1].split(')')[0].split('(')[1]
#         print(day_date)

        pred = pd.read_csv(
            day_pred,
            parse_dates=['date_time'],
            index_col='date_time', 
            date_parser=dateparse
        )
        
        for h in range(24):
            try:

                df.loc[pd.to_datetime(day_date) + timedelta(hours=h), f'{exp_name}_{d}'] = pred.loc[pd.to_datetime(day_date) + timedelta(hours=h),'0']
            
            except KeyError as e:
                
                print(day_pred)
                continue
                

    return df

In [7]:
def get_stat(fact_pred, info, day, path_to_files):
    
    forecast_cols = [col for col in fact_pred.columns if day in col]
    df = fact_pred[['temperature'] + forecast_cols].dropna()
    
    absolute_errors = df[forecast_cols].sub(df['temperature'], axis=0)
    
    relative_errors = absolute_errors.div(df['temperature'], axis=0)
    
    print(absolute_errors.index)
     
    stat = pd.DataFrame({
        'exp_name': absolute_errors.columns,
        'mean_abs_value': absolute_errors.abs().mean(),
        'mean_rel_value': relative_errors.abs().mean(),
        'median_abs_value': absolute_errors.abs().median(),
        'median_rel_value': relative_errors.abs().median(),
        'q25_abs_value': absolute_errors.abs().quantile(0.25),
        'q25_rel_value': relative_errors.abs().quantile(0.25),
        'q75_abs_value': absolute_errors.abs().quantile(0.75),
        'q75_rel_value': relative_errors.abs().quantile(0.75)
    })
        
    stat_per_h = pd.DataFrame(relative_errors.abs().groupby(df.index.hour).median(), columns=absolute_errors.columns)
    
    
    path = os.path.join(path_to_files, 'statistics', f'general_statistics_{day}.xlsx')
    path_h = os.path.join(path_to_files, 'statistics', f'general_statistics_{day}_by_hour.xlsx')
    
    if os.path.exists(path):
    
        gen_stat_df = pd.read_excel(path)
        gen_stat_h_df = pd.read_excel(path_h)
        gen_stat_df = pd.concat([gen_stat_df, stat]).drop_duplicates()
        gen_stat_h_df = pd.concat([gen_stat_h_df, stat_per_h]).drop_duplicates()

        gen_stat_df.to_excel(path, index=False)
        gen_stat_h_df.to_excel(path_h, index=False)
        
    else:
        stat.to_excel(path, index=False)
        stat_per_h.to_excel(path_h, index=False)
    
    return stat, stat_per_h

In [8]:
# def get_best_models_per_hour(stat_per_h, day):
#     pass

#     return best_models

# from pandas import DataFrame

# def get_best_models_per_hour(stat_per_h: dict, day: str, metadata_dict: dict) -> DataFrame:
#     """
#     Створює DataFrame з найкращими моделями по годинах за відносною помилкою.

#     Parameters:
#     - stat_per_h: словник {exp_name: DataFrame} з колонкою помилок по годинах
#     - day: назва дня ('d-0', 'd-1' тощо), щоб правильно вибрати стовпець помилок
#     - metadata_dict: словник {exp_name: metadata_dict} з інформацією про експерименти

#     Returns:
#     - DataFrame з індексом 0–23 (години) і колонками:
#       ['experiment_name', 'relative_error', 'model_name', 'hyperparameters', 'features', 'train_start']
#     """
#     hours = range(24)
#     rows = []

#     for h in hours:
#         best_exp = None
#         min_error = float('inf')

#         # Знаходимо експеримент з мінімальною помилкою для години h
#         for exp_name, df in stat_per_h.items():
#             try:
#                 error = df.loc[h, f'rel_error_{day}']
#                 if error < min_error:
#                     min_error = error
#                     best_exp = exp_name
#             except KeyError:
#                 continue  # пропускаємо, якщо немає такої години чи колонки

#         if best_exp is not None:
#             meta = metadata_dict.get(best_exp, {})
#             rows.append({
#                 "hour": h,
#                 "experiment_name": best_exp,
#                 "relative_error": min_error,
#                 "model_name": meta.get("model_name"),
#                 "hyperparameters": meta.get("hyperparameters"),
#                 "features": meta.get("features"),
#                 "train_start": meta.get("train_start") or meta.get("period"),
#             })

#     result_df = DataFrame(rows).set_index("hour")
#     return result_df

In [9]:
path_to_all = '/masters_diploma/'
models_list = ['xgboost', 'random_forest', 'lightgbm']

paths, metadata_paths, metadata_dict, exp_names = get_paths(models_list)

fact_temperature = facts(path_to_all)
fact_pred = fact_temperature.copy()


for key, metadata in metadata_dict.items():
#     print(key, metadata)
    
    exp_name = f"{metadata['model_name']}_{key}"
    
    for exp_forecasts in paths:
        
        k = exp_forecasts[0].split("\\")[-2].split('-')
        exp = "-".join([k[0], k[-2]])
        
        if exp == exp_name:

            fact_pred = make_forecasts_df(fact_pred, exp_forecasts, exp)
#             print(len(fact_pred.columns))
        else:
            continue

fact_pred = fact_pred.loc['2025-01-01':'2025-01-08']

for d in range(4):
    stat, stat_per_h = get_stat(fact_pred, metadata, f'd-{d}', path_to_all)
#     best_models_df = get_best_models_per_hour(stat_per_h, f'd-{d}')


164
3
DatetimeIndex(['2025-01-01 00:00:00', '2025-01-01 01:00:00',
               '2025-01-01 02:00:00', '2025-01-01 03:00:00',
               '2025-01-01 04:00:00', '2025-01-01 05:00:00',
               '2025-01-01 06:00:00', '2025-01-01 07:00:00',
               '2025-01-01 08:00:00', '2025-01-01 09:00:00',
               ...
               '2025-01-08 14:00:00', '2025-01-08 15:00:00',
               '2025-01-08 16:00:00', '2025-01-08 17:00:00',
               '2025-01-08 18:00:00', '2025-01-08 19:00:00',
               '2025-01-08 20:00:00', '2025-01-08 21:00:00',
               '2025-01-08 22:00:00', '2025-01-08 23:00:00'],
              dtype='datetime64[ns]', name='date_time', length=192, freq=None)
DatetimeIndex(['2025-01-02 00:00:00', '2025-01-02 01:00:00',
               '2025-01-02 02:00:00', '2025-01-02 03:00:00',
               '2025-01-02 04:00:00', '2025-01-02 05:00:00',
               '2025-01-02 06:00:00', '2025-01-02 07:00:00',
               '2025-01-02 08:00:00', '20

In [11]:
stat_per_h

Unnamed: 0_level_0,xgboost_430dfb8f-a2d2_d-3,xgboost_430dfb90-afb0_d-3,xgboost_430dfb91-8c7d_d-3,xgboost_430dfb92-afe1_d-3,xgboost_430dfb93-9937_d-3,xgboost_430dfb94-bc1a_d-3,xgboost_430dfb95-bcb1_d-3,xgboost_430dfb96-9f74_d-3,xgboost_430dfb97-a74a_d-3,xgboost_430dfb98-9194_d-3,...,lightgbm_430dfc01-becc_d-3,lightgbm_430dfc02-a822_d-3,lightgbm_430dfc03-931f_d-3,lightgbm_430dfc04-8666_d-3,lightgbm_430dfc05-a5bb_d-3,lightgbm_430dfc06-9a61_d-3,lightgbm_430dfc07-977c_d-3,lightgbm_430dfc08-a22e_d-3,lightgbm_430dfc09-98ec_d-3,lightgbm_430dfc0a-9a63_d-3
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.472578,0.920645,1.538025,0.928705,1.696636,0.958427,1.792398,0.963922,1.820766,0.962648,...,1.007709,1.03748,1.004349,1.034938,1.067432,1.089094,1.018244,1.031411,1.00753,1.02551
1,0.621114,0.530321,0.612536,0.457966,0.410522,0.499515,0.432768,0.436072,0.25102,0.526162,...,1.001464,1.009343,1.00059,1.005655,1.060665,1.061971,1.001702,1.006343,1.00036,1.004223
2,0.855175,0.405471,0.947625,0.457966,0.739265,0.499515,0.670195,0.433486,0.617802,0.324905,...,1.001956,1.004187,1.001305,1.002746,1.057016,1.057111,1.001889,1.001728,1.000429,1.000834
3,0.81885,0.23303,0.688237,0.228043,0.771364,0.256091,0.803362,0.304949,0.955414,0.363646,...,1.001024,1.001365,1.000368,1.00033,1.052869,1.052957,1.001128,1.001059,1.000176,1.000254
4,0.202408,0.337626,0.144043,0.248295,0.249587,0.216148,0.151564,0.311355,0.162655,0.443534,...,1.001473,1.001272,1.000337,1.000251,1.049285,1.049367,1.001273,1.000987,1.000195,1.000111
5,0.064794,0.222539,0.139142,0.196542,0.171325,0.243555,0.189728,0.246602,0.169736,0.311528,...,1.001308,1.001211,1.000409,1.000238,1.0469,1.046978,1.001359,1.000939,1.000237,1.000106
6,0.3474,0.253846,0.42689,0.251771,0.337107,0.203128,0.251575,0.129145,0.246243,0.142403,...,1.001267,1.001172,1.000818,1.000205,1.045434,1.04551,1.00111,1.001036,1.000298,1.000147
7,0.445108,0.321473,0.36989,0.358657,0.420603,0.290468,0.47825,0.41514,0.354111,0.215064,...,1.0011,1.00125,1.000499,1.000148,1.048463,1.048544,1.001014,1.001105,1.000154,1.000099
8,0.641961,0.624224,0.604347,0.489171,0.372656,0.532146,0.808986,0.469321,0.887259,0.217304,...,1.001474,1.001996,1.00058,1.000314,1.058468,1.057416,1.001583,1.001917,1.000247,1.000215
9,0.534872,1.123331,0.531811,1.278769,0.548749,1.428598,0.616215,1.325724,0.665959,1.534194,...,1.001746,1.002697,1.000751,1.000423,1.07901,1.077589,1.002024,1.00259,1.000427,1.000287
