In [1]:
from IPython.display import display, HTML
# Inject a CSS rule so the notebook's `.container` element uses 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

import re

import os
from glob import glob
from tqdm import tqdm

import yaml
from yaml import dump
import uuid
import itertools

In [3]:
import warnings
# Ignore every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings (e.g. from pandas), so
# API removals in the libraries used below will go unnoticed until they break.
warnings.simplefilter(action="ignore")

In [4]:
def get_paths(models_list, root='/masters_diploma'):
    '''
    Locate forecast files and experiment metadata for the given models.

    Two directory trees under ``root`` are scanned:

    * ``<root>/forecast/<model>/research_task_*/<model>_*/`` - one directory
      per experiment; every ``*.csv`` inside it is a forecast file.
    * ``<root>/wf_result/<model>/research_task_*`` - every ``*.csv`` inside it
      is a metadata file, and a ``.yaml`` file with the same stem is read
      for the experiment description.

    Parameters
    ----------
    models_list : iterable of str
        Model names used as directory names (and directory prefixes).
    root : str, optional
        Root directory holding the ``forecast`` and ``wf_result`` trees.

    Returns
    -------
    forecast_paths : list of list of str
        One inner list of forecast CSV paths per non-empty experiment directory.
    metadata_paths : list of str
        Paths of the metadata CSV files.
    metadata : dict
        Maps the shortened uuid (first and second-to-last uuid groups joined
        by ``-``) to a dict with keys ``uuid``, ``model_name``,
        ``duration/train_start``, ``hyperparameters`` and ``features``.
    experiment_names : list of str
        ``"<model_name>_<shortened uuid>"`` for every metadata file.

    Raises
    ------
    FileNotFoundError
        If a metadata CSV has no sibling ``.yaml`` file.
    KeyError
        If a YAML file lacks one of the keys read below.
    '''
    base = str(root).rstrip('/\\')

    paths_to_predictions = []
    paths_to_info = []
    for model in models_list:
        paths_to_predictions += glob(f'{base}/forecast/{model}/research_task_*/{model}_*/')
        paths_to_info += glob(f'{base}/wf_result/{model}/research_task_*')

    # Quick sanity check of how many experiment / result directories were found.
    print(len(paths_to_predictions))
    print(len(paths_to_info))

    # To restrict either scan to the most recent experiment only, iterate over
    # [max(<paths>, key=os.path.getctime)] instead of the full list.
    metadata_paths = []
    for info_dir in paths_to_info:
        metadata_paths.extend(glob(os.path.join(info_dir, '*.csv')))

    forecast_paths = []
    for prediction_dir in paths_to_predictions:
        prediction_paths = glob(os.path.join(prediction_dir, '*.csv'))
        if prediction_paths:
            forecast_paths.append(prediction_paths)

    metadata = {}
    experiment_names = []
    for csv_path in metadata_paths:
        # Swap only the extension; a plain str.replace would also rewrite a
        # '.csv' occurring earlier in the path.
        yaml_path = os.path.splitext(csv_path)[0] + '.yaml'
        with open(yaml_path, 'r') as f:
            res = yaml.safe_load(f)

        uuid_parts = res['unique_uuid'].split('-')
        shorten_uuid = "-".join([uuid_parts[0], uuid_parts[-2]])

        # An experiment is described either by the length of its training
        # history or by an explicit training start.
        if 'duration_training_history' in res:
            dur = res['duration_training_history']
        else:
            dur = res['train_start']

        metadata[shorten_uuid] = {
            "uuid": res['unique_uuid'],
            "model_name": res['model_name'],
            "duration/train_start": dur,
            "hyperparameters": res['model_hyperparameters'],
            "features": res['train_features']
        }

        experiment_names.append(f"{res['model_name']}_{shorten_uuid}")

    return forecast_paths, metadata_paths, metadata, experiment_names

In [5]:
def facts(path_to_all):
    '''
    Load the observed temperature series.

    Reads ``<path_to_all>/processed_data/history_weather.csv``, parses its
    ``date`` column with the format ``%Y-%m-%d %H:%M:%S`` and returns only the
    ``temperature`` column.

    Parameters
    ----------
    path_to_all : str
        Project root directory that contains ``processed_data``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``temperature``) indexed by a ``DatetimeIndex``
        named ``date_time``.
    '''
    path_to_weather = os.path.join(path_to_all, 'processed_data', 'history_weather.csv')

    weather = pd.read_csv(path_to_weather)
    # Parse explicitly instead of using read_csv's `date_parser`, which is
    # deprecated since pandas 2.0.
    weather['date'] = pd.to_datetime(weather['date'], format='%Y-%m-%d %H:%M:%S')

    fact_temperature = weather.set_index('date')[['temperature']]
    fact_temperature.index.name = 'date_time'

    return fact_temperature

In [6]:
def make_forecasts_df(fact_pred, paths_to_exp_forecasts, info, exp_name):
    '''
    Add the forecasted temperature values of one experiment to ``fact_pred``.

    For every forecast file the 24 hourly values of the forecasted day are
    copied from the file's ``'0'`` column into the column
    ``"<exp_name>_<d>"`` of ``fact_pred``, where ``<d>`` is the fourth
    underscore-separated token from the end of the file path.

    Parameters
    ----------
    fact_pred : pandas.DataFrame
        Frame indexed by timestamp; it is modified in place.
    paths_to_exp_forecasts : iterable of str
        Paths of the forecast CSV files. Each file needs a ``date_time``
        column formatted as ``%Y-%m-%d %H:%M:%S`` and a ``0`` column.
        The last underscore-separated token of the file name must contain
        the forecasted day in parentheses, e.g. ``..._(2025-01-01).csv``.
    info : dict
        Experiment metadata; currently unused, kept for interface stability.
    exp_name : str
        Prefix of the columns written to ``fact_pred``.

    Returns
    -------
    pandas.DataFrame
        The same ``fact_pred`` object that was passed in.
    '''
    for day_pred in paths_to_exp_forecasts:
        d = day_pred.split('_')[-4]

        # Take the file name regardless of which path separator the OS used.
        file_name = re.split(r'[\\/]', day_pred)[-1]
        day_date = file_name.split('_')[-1].split(')')[0].split('(')[1]
        print(day_date)

        pred = pd.read_csv(day_pred, index_col='date_time')
        # Parse explicitly instead of using read_csv's `date_parser`, which is
        # deprecated since pandas 2.0.
        pred.index = pd.to_datetime(pred.index, format='%Y-%m-%d %H:%M:%S')

        day_start = pd.to_datetime(day_date)
        column = f'{exp_name}_{d}'

        for h in range(24):
            timestamp = day_start + timedelta(hours=h)
            try:
                fact_pred.loc[timestamp, column] = pred.loc[timestamp, '0']
            except KeyError:
                # The file has no value for this hour: report the file and
                # leave the cell untouched.
                print(day_pred)
                continue

    return fact_pred

In [7]:
def get_stat(fact_pred, exp_name, day):
    '''
    Placeholder for the per-experiment statistics computation.

    Intended to return a pair ``(stat, stat_per_h)``; the computation has not
    been written yet.

    Raises
    ------
    NotImplementedError
        Always. The previous stub returned names that were never defined,
        so any call failed with a confusing ``NameError``.
    '''
    raise NotImplementedError("get_stat is not implemented yet")

In [8]:
def get_best_models_per_hour(stat_per_h, day):
    '''
    Placeholder for selecting the best models per hour.

    Intended to return ``best_models``; the selection has not been written yet.

    Raises
    ------
    NotImplementedError
        Always. The previous stub returned a name that was never defined,
        so any call failed with a confusing ``NameError``.
    '''
    raise NotImplementedError("get_best_models_per_hour is not implemented yet")

In [9]:
# Build, for every experiment found on disk, a frame with the observed
# temperature next to that experiment's forecasts.
path_to_all = '/masters_diploma/'
models_list = ['xgboost', 'random_forest', 'lightgbm']

paths, metadata_paths, metadata_dict, exp_names = get_paths(models_list)

# The observed temperatures do not depend on the experiment, so read them once
# instead of re-reading the CSV on every iteration.
fact_temperature = facts(path_to_all)

for key, metadata in metadata_dict.items():
    print(key, metadata)

    exp_name = f"{metadata['model_name']}_{key}"

    # NOTE: `fact_pred` is rebuilt for every experiment, so after the loop it
    # holds only the frame of the last experiment in `metadata_dict`.
    fact_pred = fact_temperature.copy()

    for exp_forecasts in paths:

        # Name of the directory holding this experiment's forecast files
        # ("<model>_<uuid>"); os.path works with either path separator,
        # whereas splitting on "\\" only worked for Windows-style paths.
        exp_dir = os.path.basename(os.path.dirname(exp_forecasts[0]))
        uuid_parts = exp_dir.split('-')
        exp = "-".join([uuid_parts[0], uuid_parts[-2]])

        if exp == exp_name:

            print(exp_name)

            fact_pred = make_forecasts_df(fact_pred, exp_forecasts, metadata, exp_name)

    # Keep only the timestamps that have the fact and every forecast column.
    fact_pred = fact_pred.dropna()

    # TODO: enable once get_stat / get_best_models_per_hour are implemented.
    # for d in range(4):
    #     stat, stat_per_h = get_stat(fact_pred, exp_name, d)
    #     best_models_df = get_best_models_per_hour(stat_per_h, d)


# gen_stat_df = df.read_csv(os.path.join(path_to_files, 'statistics', 'general_statistics'))
# gen_stat_df = pd.concat([gen_stat_df, stat]).drop_duplicates()

# gen_stat_df.to_csv(os.path.join(path_to_files, 'statistics', 'general_statistics'))

164
3
dcde754a-92fb {'uuid': 'dcde754a-df0d-11ef-92fb-3417ebde98e4', 'model_name': 'xgboost', 'duration/train_start': datetime.datetime(2020, 1, 1, 0, 0), 'hyperparameters': {'booster': 'gbtree', 'colsample_bytree': 1, 'eta': 0.3, 'eval_metric': 'rmse', 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 50, 'n_jobs': -1, 'objective': 'reg:squarederror', 'random_state': 2, 'subsample': 1}, 'features': ['month', 'year_day', 'is_day', 'sunshine_duration', 'temperature_min_3_weeks', 'temperature_max_3_weeks', 'temperature_mean_3_weeks', 'temperature_min_3_years', 'temperature_max_3_years', 'temperature_mean_3_years', 'new_mean_temp_7_days', 'new_mean_cloud_7_days', 'new_mean_humid_7_days', 'mean_temp', 'cloud_cover_mean_7_days', 'pressure_msl_mean_7_days']}
xgboost_dcde754a-92fb
2025-01-01
2025-01-02
2025-01-03
2025-01-04
2025-01-05
2025-01-06
2025-01-07
2025-01-08
2025-01-02
2025-01-03
2025-01-04
2025-01-05
2025-01-06
2025-01-07
2025-01-08
2025-01-03
2025-01-04
2025-01-05
2025-01-06
2

In [10]:
# `fact_pred` is reassigned on every iteration of the experiment loop above,
# so this shows only the frame built in the loop's final iteration.
fact_pred

Unnamed: 0_level_0,temperature,lightgbm_dcde9d86-8055_d-0,lightgbm_dcde9d86-8055_d-1,lightgbm_dcde9d86-8055_d-2,lightgbm_dcde9d86-8055_d-3
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-01-04 00:00:00,-1.7,0.547930,0.547930,0.547930,0.547930
2025-01-04 01:00:00,-1.2,0.415846,0.415846,0.415846,0.415846
2025-01-04 02:00:00,-1.0,0.415846,0.415846,0.415846,0.415846
2025-01-04 03:00:00,-0.8,0.081877,0.081877,0.081877,0.081877
2025-01-04 04:00:00,-0.7,0.090966,0.090966,0.090966,0.090966
...,...,...,...,...,...
2025-01-08 19:00:00,3.0,0.570231,0.570231,0.570231,0.570231
2025-01-08 20:00:00,3.2,0.513399,0.513399,0.513399,0.513399
2025-01-08 21:00:00,3.7,0.677445,0.677445,0.677445,0.677445
2025-01-08 22:00:00,4.8,0.513399,0.513399,0.513399,0.513399
