In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import re

import os
from glob import glob
from tqdm import tqdm

import yaml
from yaml import dump
import uuid
import itertools

In [3]:
import warnings
warnings.simplefilter(action="ignore")

In [4]:
def get_paths(models_list):
    '''
    Finds all the paths to forecasts and experiments metadata (directories /forecast/ and /wf_result/)
    
    Returns list with paths to forecast files, dict with metadata and list of all the experiment names
    '''
    
    uuids = []
    model_names = []
    train_start_or_duration = []
    hyperparameters = []
    features = []
    n_models = []
    
    paths_to_predictions = []
    paths_to_info = []

    forecast_paths = []
    metadata_paths = []


    for model in models_list:
        paths_to_predictions += glob(f'/masters_diploma/forecast/{model}/research_task_*/{model}_*/')
        paths_to_info += glob(f'/masters_diploma/wf_result/{model}/research_task_*')

    print(len(paths_to_predictions))
    print(len(paths_to_info))
            
    for path2 in paths_to_info:   
#     for path2 in [max(paths_to_info, key=os.path.getctime)]:   # тільки для останнього експерименту
        metadata_paths.extend(glob(os.path.join(path2, '*.csv')))


    for path2 in paths_to_predictions:
#     for path2 in [max(paths_to_predictions, key=os.path.getctime)]:   # тільки для останнього експерименту
        prediction_paths = glob(os.path.join(path2, f'*.csv'))
        if len(prediction_paths) > 0:
            forecast_paths.append(prediction_paths)

    yaml_file_paths = [f.replace('.csv', '.yaml') for f in metadata_paths]

    metadata = {}
    experiment_names = []
    for file in yaml_file_paths: 
        with open(file, 'r') as f:
            res = yaml.safe_load(f)

        shorten_uuid = "-".join([res['unique_uuid'].split('-')[0], res['unique_uuid'].split('-')[-2]])
        dur = res['duration_training_history'] if 'duration_training_history' in res else res['train_start']

        metadata[shorten_uuid] = {
            "uuid": res['unique_uuid'],
            "model_name": res['model_name'],
            "duration/train_start": dur,
            "hyperparameters": res['model_hyperparameters'],
            "features": res['train_features']
        }

        experiment_names.append(f"{res['model_name']}_{shorten_uuid}")


    # for file in yaml_file_paths: 
    #     with open(file, 'r') as f:
    #         res = yaml.safe_load(f)
    #     uuids.append(res['unique_uuid'])
    #     model_names.append(res['model_name'])
    #     hyperparameters.append(res['model_hyperparameters'])
    #     features.append(res['train_features'])
    #     if 'duration_training_history' in res:
    #         train_start_or_duration.append(res['duration_training_history'])
    #     else:
    #         train_start_or_duration.append(res['train_start'])
     
    # shorten_uuids = ["-".join([n.split('-')[0], n.split('-')[-2]]) for n in uuids]
    # metadata_lst = list(zip(uuids, shorten_uuids, model_names, train_start_or_duration, hyperparameters, features, n_models))           
    
    # file_paths_splited = [metadata_paths[k].split('-') for k in range(len(metadata_paths))]
    # model_name = [file_paths_splited[k][-5].split('\\')[1] for k in range(len(metadata_paths))]
    # experiment_names = ["-".join([model_name[k], file_paths_splited[k][-2]]) for k in range(len(metadata_paths))]     
    

    return forecast_paths, metadata_paths, metadata, experiment_names

In [5]:
def facts(path_to_all):

    dateparse = lambda dates: datetime.strptime(dates, '%Y-%m-%d %H:%M:%S')
    path_to_weather = f'{path_to_all}/processed_data/history_weather.csv'

    fact_temperature = pd.read_csv(
        path_to_weather,
        parse_dates=['date'],
        index_col='date', 
        date_parser=dateparse
    )[['temperature']]
    
    fact_temperature.index.name = 'date_time'

    return fact_temperature

In [6]:
def make_forecasts_df(fact_pred, paths_to_exp_forecasts, exp_name):

    '''
    Creating a dataframe of forecasted temperature values
    '''

    dateparse = lambda dates: datetime.strptime(dates, '%Y-%m-%d %H:%M:%S')
    
    df = fact_pred.copy()
    
    for num_exp, day_pred in enumerate(paths_to_exp_forecasts):
        d = day_pred.split('_')[-4]
        day_date = day_pred.split('\\')[-1].split('_')[-1].split(')')[0].split('(')[1]
#         print(day_date)

        pred = pd.read_csv(
            day_pred,
            parse_dates=['date_time'],
            index_col='date_time', 
            date_parser=dateparse
        )
        
        for h in range(24):
            try:

                df.loc[pd.to_datetime(day_date) + timedelta(hours=h), f'{exp_name}_{d}'] = pred.loc[pd.to_datetime(day_date) + timedelta(hours=h),'0']
            
            except KeyError as e:
                
                print(day_pred)
                continue
                

    return df

In [14]:
def get_stat(fact_pred, info, day, path_to_files):
    
    forecast_cols = [col for col in fact_pred.columns if f'_d-{day}' in col]
    df = fact_pred[['temperature'] + forecast_cols].dropna()
    
    absolute_errors = df[forecast_cols].sub(df['temperature'], axis=0)
    
    relative_errors = absolute_errors.div(df['temperature'], axis=0)
    
    print(absolute_errors.index)
     
    stat = pd.DataFrame({
        'mean_abs_value': absolute_errors.abs().mean(),
        'mean_rel_value': relative_errors.abs().mean(),
        'median_abs_value': absolute_errors.abs().median(),
        'median_rel_value': relative_errors.abs().median(),
        'q25_abs_value': absolute_errors.abs().quantile(0.25),
        'q25_rel_value': relative_errors.abs().quantile(0.25),
        'q75_abs_value': absolute_errors.abs().quantile(0.75),
        'q75_rel_value': relative_errors.abs().quantile(0.75)
    })
    
    path = os.path.join(path_to_files, 'statistics', f'general_statistics_{day}.xlsx')
    
    if os.path.exists(path):
    
        gen_stat_df = pd.read_excel(path)
        gen_stat_df = pd.concat([gen_stat_df, stat]).drop_duplicates()

        gen_stat_df.to_excel(path, index=False)
        
    else:
        stat.to_excel(path, index=False)    
    
    
    stat_per_h = pd.DataFrame({
        'mean_abs_value': absolute_errors.abs().groupby(df.index.hour).mean(),
        'mean_rel_value': relative_errors.abs().groupby(df.index.hour).mean(),
        'median_abs_value': absolute_errors.abs().groupby(df.index.hour).median(),
        'median_rel_value': relative_errors.abs().groupby(df.index.hour).median(),
        'q25_abs_value': absolute_errors.abs().groupby(df.index.hour).quantile(0.25),
        'q25_rel_value': relative_errors.abs().groupby(df.index.hour).quantile(0.25),
        'q75_abs_value': absolute_errors.abs().groupby(df.index.hour).quantile(0.75),
        'q75_rel_value': relative_errors.abs().groupby(df.index.hour).quantile(0.75),
    })
    
    return stat, stat_per_h

In [15]:
def get_best_models_per_hour(stat_per_h, day):
    pass

    return best_models

In [16]:
path_to_all = '/masters_diploma/'
models_list = ['xgboost', 'random_forest', 'lightgbm']

paths, metadata_paths, metadata_dict, exp_names = get_paths(models_list)

fact_temperature = facts(path_to_all)
fact_pred = fact_temperature.copy()


for key, metadata in metadata_dict.items():
#     print(key, metadata)
    
    exp_name = f"{metadata['model_name']}_{key}"
    
    for exp_forecasts in paths:
        
        k = exp_forecasts[0].split("\\")[-2].split('-')
        exp = "-".join([k[0], k[-2]])
        
        if exp == exp_name:

            fact_pred = make_forecasts_df(fact_pred, exp_forecasts, exp)
#             print(len(fact_pred.columns))
        else:
            continue

fact_pred = fact_pred.loc['2025-01-01':'2025-01-08']

for d in range(4):
    stat, stat_per_h = get_stat(fact_pred, metadata, d, path_to_all)
    best_models_df = get_best_models_per_hour(stat_per_h, d)


164
3
DatetimeIndex(['2025-01-01 00:00:00', '2025-01-01 01:00:00',
               '2025-01-01 02:00:00', '2025-01-01 03:00:00',
               '2025-01-01 04:00:00', '2025-01-01 05:00:00',
               '2025-01-01 06:00:00', '2025-01-01 07:00:00',
               '2025-01-01 08:00:00', '2025-01-01 09:00:00',
               ...
               '2025-01-08 14:00:00', '2025-01-08 15:00:00',
               '2025-01-08 16:00:00', '2025-01-08 17:00:00',
               '2025-01-08 18:00:00', '2025-01-08 19:00:00',
               '2025-01-08 20:00:00', '2025-01-08 21:00:00',
               '2025-01-08 22:00:00', '2025-01-08 23:00:00'],
              dtype='datetime64[ns]', name='date_time', length=192, freq=None)


ValueError: If using all scalar values, you must pass an index

In [None]:
fact_pred