In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import re

import os
from glob import glob
from tqdm import tqdm

import yaml
from yaml import dump
import uuid
import itertools

In [3]:
import warnings
warnings.simplefilter(action="ignore")

In [4]:
def get_paths(models_list):
    '''
    Finds all the paths to forecasts and experiments metadata (directories /forecast/ and /wf_result/)
    
    Returns list with paths to forecast files, dict with metadata and list of all the experiment names
    '''
    
    uuids = []
    model_names = []
    train_start_or_duration = []
    hyperparameters = []
    features = []
    n_models = []
    
    paths_to_predictions = []
    paths_to_info = []

    forecast_paths = []
    metadata_paths = []


    for model in models_list:
        paths_to_predictions += glob(f'/masters_diploma/forecast/{model}/research_task_*/{model}_*/')
        paths_to_info += glob(f'/masters_diploma/wf_result/{model}/research_task_*')

#     print(len(paths_to_predictions))
#     print(len(paths_to_info))
            
    for path2 in paths_to_info:   
#     for path2 in [max(paths_to_info, key=os.path.getctime)]:   # тільки для останнього експерименту
        metadata_paths.extend(glob(os.path.join(path2, '*.csv')))


    for path2 in paths_to_predictions:
#     for path2 in [max(paths_to_predictions, key=os.path.getctime)]:   # тільки для останнього експерименту
        prediction_paths = glob(os.path.join(path2, f'*.csv'))
        if len(prediction_paths) > 0:
            forecast_paths.append(prediction_paths)

    yaml_file_paths = [f.replace('.csv', '.yaml') for f in metadata_paths]

    metadata = {}
    experiment_names = []
    for file in yaml_file_paths: 
        with open(file, 'r') as f:
            res = yaml.safe_load(f)

        shorten_uuid = "-".join([res['unique_uuid'].split('-')[0], res['unique_uuid'].split('-')[-2]])
        dur = res['duration_training_history'] if 'duration_training_history' in res else res['train_start']

        metadata[shorten_uuid] = {
            "uuid": res['unique_uuid'],
            "model_name": res['model_name'],
            "duration/train_start": dur,
            "hyperparameters": res['model_hyperparameters'],
            "features": res['train_features']
        }

        experiment_names.append(f"{res['model_name']}_{shorten_uuid}")   
    

    return forecast_paths, metadata_paths, metadata, experiment_names

In [5]:
def facts(path_to_all):

    dateparse = lambda dates: datetime.strptime(dates, '%Y-%m-%d %H:%M:%S')
    path_to_weather = f'{path_to_all}/processed_data/history_weather.csv'

    fact_temperature = pd.read_csv(
        path_to_weather,
        parse_dates=['date'],
        index_col='date', 
        date_parser=dateparse
    )[['temperature']]
    
    fact_temperature.index.name = 'date_time'

    return fact_temperature

In [6]:
def make_forecasts_df(fact_pred, paths_to_exp_forecasts, exp_name):

    '''
    Creating a dataframe of forecasted temperature values
    '''

    dateparse = lambda dates: datetime.strptime(dates, '%Y-%m-%d %H:%M:%S')
    
    df = fact_pred.copy()
    
    for num_exp, day_pred in enumerate(paths_to_exp_forecasts):
        d = day_pred.split('_')[-4]
        day_date = day_pred.split('\\')[-1].split('_')[-1].split(')')[0].split('(')[1]
#         print(day_date)

        pred = pd.read_csv(
            day_pred,
            parse_dates=['date_time'],
            index_col='date_time', 
            date_parser=dateparse
        )
        
        for h in range(24):
            try:

                df.loc[pd.to_datetime(day_date) + timedelta(hours=h), f'{exp_name}_{d}'] = pred.loc[pd.to_datetime(day_date) + timedelta(hours=h),'0']
            
            except KeyError as e:
                
                print(day_pred)
                continue
                

    return df

In [7]:
def get_stat(fact_pred, info, day, path_to_files):
    
    forecast_cols = [col for col in fact_pred.columns if day in col]
    df = fact_pred[['temperature'] + forecast_cols].dropna()
    
    df.columns = df.columns.str.replace(r'_d-\d+$', '', regex=True)

    
    absolute_errors = df[df.columns[1:]].sub(df['temperature'], axis=0)
    
    relative_errors = absolute_errors.div(df['temperature'], axis=0)
    
    print(absolute_errors.columns)
    
    stat_dict = {}
    for exp in info.keys():
        exp_name = f"{info[exp]['model_name']}_{exp}"
        
        stat_dict[exp] = {
            'mean_abs_value': absolute_errors[exp_name].abs().mean(),
            'mean_rel_value': relative_errors[exp_name].abs().mean(),
            'median_abs_value': absolute_errors[exp_name].abs().median(),
            'median_rel_value': relative_errors[exp_name].abs().median(),
            'q25_abs_value': absolute_errors[exp_name].abs().quantile(0.25),
            'q25_rel_value': relative_errors[exp_name].abs().quantile(0.25),
            'q75_abs_value': absolute_errors[exp_name].abs().quantile(0.75),
            'q75_rel_value': relative_errors[exp_name].abs().quantile(0.75),
            "model_name": info[exp]["model_name"],
            "hyperparameters": info[exp]["hyperparameters"],
            "features": info[exp]["features"],
            "train_start": info[exp]["duration/train_start"]
        }
        
    stat = pd.DataFrame(stat_dict).T
        
    stat_per_h = pd.DataFrame(relative_errors.abs().groupby(df.index.hour).median(), columns=relative_errors.columns)
    
    
    path = os.path.join(path_to_files, 'statistics', f'general_statistics_{day}.xlsx')
    path_h = os.path.join(path_to_files, 'statistics', f'general_statistics_{day}_by_hour.xlsx')
    
    if os.path.exists(path):
    
        gen_stat_df = pd.read_excel(path)
        gen_stat_h_df = pd.read_excel(path_h)
        gen_stat_df = pd.concat([gen_stat_df, stat]).drop_duplicates()
        gen_stat_h_df = pd.concat([gen_stat_h_df, stat_per_h]).drop_duplicates()

        gen_stat_df.to_excel(path, index=False)
        gen_stat_h_df.to_excel(path_h, index=False)
        
    else:
        stat.to_excel(path, index=False)
        stat_per_h.to_excel(path_h)
    
    return stat, stat_per_h

In [8]:
def get_best_models_per_hour(stat_per_h, day, info, path_to_files):
    
    min_errors = stat_per_h_df.min(axis=1)
    best_exps = stat_per_h_df.idxmin(axis=1)
    
    
    best_models = pd.concat([best_exps, min_errors], axis=1)
    best_models.columns=['experiment_name', 'median_rel_err_value']
    
    for h in best_models.index:
        exp = best_models.loc[h, 'experiment_name']
        meta = info[exp]
        
        best_models.loc[best_models['experiment_name']==exp, 'model_name'] = meta['model_name']
        best_models.loc[best_models['experiment_name']==exp, 'hyperparameters'] = meta['hyperparameters']
        best_models.loc[best_models['experiment_name']==exp, 'features'] = meta['features']
        best_models.loc[best_models['experiment_name']==exp, 'duration/train_start'] = meta['duration/train_start']
        
        
    path = get_next_versioned_filename(os.path.join(path_to_files, 'statistics', 'best_for_hour'), day)
    best_models.to_excel(path)
    
    
    return best_models

In [9]:
def get_next_versioned_filename(base_dir, day, prefix="hourly_best", ext=".xlsx"):
    today = datetime.today().strftime("%Y-%m-%d")
    pattern = re.compile(rf"{prefix}_{day}_{today}_v(\d+){re.escape(ext)}")
    
    # Отримаємо всі файли в директорії, які відповідають шаблону
    existing_versions = []
    for filename in os.listdir(base_dir):
        match = pattern.match(filename)
        if match:
            existing_versions.append(int(match.group(1)))
    
    next_version = max(existing_versions, default=0) + 1
    new_filename = f"{prefix}_{today}_v{next_version}{ext}"
    
    
    return os.path.join(base_dir, new_filename)

In [10]:
path_to_all = '/masters_diploma/'
models_list = ['xgboost', 'random_forest', 'lightgbm']

print('gathering experiment info...')
paths, metadata_paths, metadata_dict, exp_names = get_paths(models_list)

print('loading fact temperature dataset...')
fact_temperature = facts(path_to_all)
fact_pred = fact_temperature.copy()

print('adding experiments` forecasts...')

for key, metadata in tqdm(metadata_dict.items()):
#     print(key, metadata)
    
    exp_name = f"{metadata['model_name']}_{key}"
    
    for exp_forecasts in paths:
        
        k = exp_forecasts[0].split("\\")[-2].split('-')
        exp = "-".join([k[0], k[-2]])
        
        if exp == exp_name:

            fact_pred = make_forecasts_df(fact_pred, exp_forecasts, exp)
#             print(len(fact_pred.columns))
        else:
            continue

fact_pred = fact_pred.loc['2025-01-01':'2025-01-08']

for d in range(4):
    print(f'\ncalculating statistics for day {d}...')
    stat, stat_per_h = get_stat(fact_pred, metadata_dict, f'd-{d}', path_to_all)
    print('finished')
    print('finding best model for hour...')
    best_models_df = get_best_models_per_hour(stat_per_h, f'd-{d}', metadata_dict)
    print('done\n')


gathering experiment info...
loading fact temperature dataset...
adding experiments` forecasts...


100%|████████████████████████████████████████████████████████████████████████████████| 164/164 [02:08<00:00,  1.27it/s]



calculating statistics for day 0...
Index(['xgboost_430dfb8f-a2d2', 'xgboost_430dfb90-afb0',
       'xgboost_430dfb91-8c7d', 'xgboost_430dfb92-afe1',
       'xgboost_430dfb93-9937', 'xgboost_430dfb94-bc1a',
       'xgboost_430dfb95-bcb1', 'xgboost_430dfb96-9f74',
       'xgboost_430dfb97-a74a', 'xgboost_430dfb98-9194',
       'xgboost_430dfb99-aade', 'xgboost_430dfb9a-9978',
       'xgboost_430dfb9b-8c23', 'xgboost_430dfb9c-b9a9',
       'xgboost_430dfb9d-a005', 'xgboost_430dfb9e-88ad',
       'xgboost_430dfb9f-82fb', 'xgboost_430dfba0-b867',
       'xgboost_430dfba1-b784', 'xgboost_430dfba2-938a',
       'xgboost_430dfba3-9c9c', 'xgboost_430dfba4-a408',
       'xgboost_430dfba5-9fd9', 'xgboost_430dfba6-950a',
       'xgboost_430dfba7-b07c', 'xgboost_430dfba8-9e68',
       'xgboost_430dfba9-822f', 'xgboost_430dfbaa-aecb',
       'xgboost_430dfbab-be7a', 'xgboost_430dfbac-93a3',
       'xgboost_430dfbe1-ac10', 'xgboost_430dfbe2-80eb',
       'xgboost_430dfbe3-8e5d', 'xgboost_430dfbe4-8

KeyError: 'random_forest_430dfbb9-a577'

In [None]:
metadata_dict