In [1]:
from IPython.display import display, HTML
# Inject a CSS override so the classic-notebook ".container" element spans
# 90% of the browser window instead of its default width.
display(HTML(data="<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import re

import os
from pathlib import Path

from glob import glob
from tqdm import tqdm

import yaml
from yaml import dump
import uuid
import itertools

In [3]:
import warnings
# Install a catch-all "ignore" filter: every warning category is silenced for
# the rest of the session, including deprecation notices raised by libraries.
warnings.simplefilter("ignore")

In [4]:
def get_paths(models_list):
    
    """
    Finds all the paths to forecasts and experiments metadata (directories /forecast/ and /wf_result/)

    Returns:
        forecast_paths   : list[str]  - шляхи до всіх CSV з прогнозами (останній research_task_* для кожної моделі)
        metadata_paths   : list[str]  - шляхи до всіх CSV з метаданими (останній research_task_* для кожної моделі)
        metadata         : dict       - словник, зчитаний з відповідних YAML файлів (за іменами CSV -> .yaml)
        experiment_names : list[str]  - [f"{model_name}_{short_uuid}"]
    """
    
    
    base_forecast = Path("/masters_diploma/forecast")
    base_info     = Path("/masters_diploma/wf_result")

    forecast_paths = []
    metadata_paths = []
    metadata = {}
    experiment_names = []

    def latest_task_dir(root: Path) -> Path | None:
        candidates = list(root.glob("research_task_*"))
        if not candidates:
            return None
        return max(candidates, key=lambda p: p.stat().st_ctime)

    for model in models_list:
        info_root = base_info / model
        info_task = latest_task_dir(info_root)
        if info_task and info_task.is_dir():
            mp = sorted([str(p) for p in info_task.glob("*.csv")])
            metadata_paths.extend(mp)
            
        pred_root = base_forecast / model
        pred_task = latest_task_dir(pred_root)
        if pred_task and pred_task.is_dir():
            fps = sorted([str(p) for p in pred_task.rglob("*.csv")])
            forecast_paths.extend(fps)

            
    yaml_file_paths = []
    for csv_path in metadata_paths:
        y = Path(csv_path).with_suffix(".yaml")
        if y.exists():
            yaml_file_paths.append(str(y))

    for file in yaml_file_paths:
        try:
            yaml.SafeLoader.add_constructor(
                'tag:yaml.org,2002:python/tuple',
                lambda loader, node: tuple(loader.construct_sequence(node))
            )

            with open(file, "r", encoding="utf-8") as f:
                res = yaml.load(f, Loader=yaml.SafeLoader)
        except Exception:
            continue

        uid = res.get("unique_uuid", "")
        parts = uid.split("-")
        if len(parts) >= 2:
            shorten_uuid = "-".join([parts[0], parts[-2]])
        else:
            shorten_uuid = uid or "unknown"

        dur = res.get("duration_training_history", res.get("train_start"))
        hp = (res.get("model_hyperparameters"), res.get("fit_kwargs")) if res.get("model_name") == 'sarimax' else res.get("model_hyperparameters")
        

        metadata[shorten_uuid] = {
            "uuid": uid,
            "model_name": res.get("model_name"),
            "duration/train_start": dur,
            "hyperparameters": hp,
            "features": res.get("train_features"),
        }

        experiment_names.append(f"{res.get('model_name')}_{shorten_uuid}")

        
    return forecast_paths, metadata_paths, metadata, experiment_names

In [5]:
def facts(path_to_all):
    """
    Load the observed temperature series.

    Reads ``<path_to_all>/processed_data/history_weather.csv``, parses its
    ``date`` column with the format ``%Y-%m-%d %H:%M:%S`` and uses it as the
    index.

    Args:
        path_to_all: root directory that contains ``processed_data``.

    Returns:
        DataFrame with a single ``temperature`` column and a datetime index
        named ``date_time``.
    """
    # Path joining tolerates a trailing separator in path_to_all.
    path_to_weather = Path(path_to_all) / 'processed_data' / 'history_weather.csv'

    weather = pd.read_csv(path_to_weather)
    # Vectorised parsing with an explicit format; replaces the per-value
    # strptime callback passed through the deprecated `date_parser` argument.
    weather['date'] = pd.to_datetime(weather['date'], format='%Y-%m-%d %H:%M:%S')

    fact_temperature = weather.set_index('date')[['temperature']]
    fact_temperature.index.name = 'date_time'

    return fact_temperature

In [6]:
def make_forecasts_df(fact_pred, paths_to_exp_forecasts, exp_name):

    '''
    Creating a dataframe of forecasted temperature values

    A copy of ``fact_pred`` is extended with one column per forecast file,
    named ``f"{exp_name}_{d}"``, where ``d`` is a token taken from the file
    path (see below). For each file, the 24 hourly timestamps starting at the
    date embedded in the file name are looked up in the file's first data
    column and written into that column of the copy.

    Args:
        fact_pred: DataFrame indexed by timestamps; it is not modified.
        paths_to_exp_forecasts: a single forecast CSV path, or an iterable of
            such paths. Each CSV must have a ``date_time`` column formatted
            as ``%Y-%m-%d %H:%M:%S``.
        exp_name: prefix for the created column names.

    Returns:
        The extended copy of ``fact_pred``.
    '''

    if isinstance(paths_to_exp_forecasts, (str, os.PathLike)):
        forecast_files = [paths_to_exp_forecasts]
    else:
        forecast_files = list(paths_to_exp_forecasts)

    df = fact_pred.copy()

    for forecast_file in forecast_files:
        day_pred = os.fspath(forecast_file)

        # The path is split on "_" and indexed from the end. The model name
        # 'random_forest' itself contains an underscore, so the token of
        # interest sits one position further left for that model.
        # NOTE(review): presumably this token identifies the forecast
        # horizon/day - confirm against the file naming scheme.
        name_parts = day_pred.split('_')
        if "_".join([name_parts[-4], name_parts[-3]]) == 'random_forest':
            d = name_parts[-5]
        else:
            d = name_parts[-4]

        # The date is the text between "(" and ")" in the last "_" segment.
        day_date = day_pred.split('\\')[-1].split('_')[-1].split(')')[0].split('(')[1]

        pred = pd.read_csv(day_pred)
        # Vectorised parsing; replaces the deprecated `date_parser` callback.
        pred['date_time'] = pd.to_datetime(pred['date_time'], format='%Y-%m-%d %H:%M:%S')
        pred = pred.set_index('date_time')

        # Loop invariants hoisted out of the hourly loop.
        forecast = pred.iloc[:, 0]
        target_column = f'{exp_name}_{d}'
        day_start = pd.to_datetime(day_date)

        has_missing_hours = False
        for h in range(24):
            ts = day_start + timedelta(hours=h)
            if ts in forecast.index:
                df.loc[ts, target_column] = forecast.loc[ts]
            else:
                has_missing_hours = True

        if has_missing_hours:
            # Report an incomplete file once rather than once per missing hour.
            print(day_pred)

    return df

In [6]:
# Driver cell: gather experiment paths, load the observed temperatures and
# append every experiment's forecasts as extra columns of `fact_pred`.
path_to_all = '/masters_diploma/'
models_list = ['random_forest', 'xgboost', 'lightgbm', 'sarimax']

print('gathering experiment info...')
paths, metadata_paths, metadata_dict, exp_names = get_paths(models_list)

print('loading fact temperature dataset...')
fact_temperature = facts(path_to_all)
fact_pred = fact_temperature.copy()

print('adding experiments` forecasts...')

for exp_forecasts in tqdm(paths):

    # The experiment id is the name of the directory that holds the CSV.
    # Path handles both "\" and "/" separators, unlike splitting on "\\".
    exp_dir_name = Path(exp_forecasts).parent.name
    uuid_parts = exp_dir_name.split('-')
    if len(uuid_parts) >= 2:
        # Same shortening as in get_paths: first and second-to-last groups.
        exp = "-".join([uuid_parts[0], uuid_parts[-2]])
    else:
        # Directory name without dashes: use it unchanged instead of failing.
        exp = exp_dir_name

    fact_pred = make_forecasts_df(fact_pred, exp_forecasts, exp)

gathering experiment info...
loading fact temperature dataset...
adding experiments` forecasts...


 94%|█████████████████████████████████████████████████████████████████████▎    | 18402/19650 [1:59:24<08:05,  2.57it/s]


In [None]:
# colors = [
#     "#4169E1", "#DC143C", "#228B22", "#DAA520", "#FF8C00", "#8A2BE2", "#00BFFF", "#FF1493"
# ]

# for feature in list(features.keys()):
    
#     fig = make_subplots(rows=1, cols=1, 
#                     subplot_titles=(f"Means of {feature.title()}"))

#     fig.add_trace(go.Scatter(
#         y=df.loc['2024-01-01':, feature],
#         x=df.loc['2024-01-01':,:].index,
#         name=feature,
#         marker_color="black"
#     ), row=1, col=1)
    
#     for i, name in enumerate([i for i in df.columns if i.startswith(f"{feature}_mean_")]):

#         fig.add_trace(go.Scatter(
#             y=df.loc['2024-01-01':, name],
#             x=df.loc['2024-01-01':,:].index,
#             name=name,
#             marker_color=colors[i]
#         ), row=1, col=1)

#     fig.update_layout(
#         title=f'Means of {feature.title()}', 
#         xaxis_tickangle=-45,
#         width=1300, height=1000,
#         hovermode="x"
#     )

#     fig.write_html(f'/masters_diploma/correlation_vizualizations/visualization_mean_{feature}.html')