In [1]:
import numpy as np
import warnings

import pandas as pd
import copy
warnings.filterwarnings('ignore')
import wandb
from pathlib import Path

from views_forecasts.extensions import *
from utils import fetch_data, transform_data, get_config_path, get_config_from_path, retrain_transformed_sweep, evaluate

from sklearn.metrics import mean_squared_error
# import importlib
# import utils  
# importlib.reload(utils)

import os
# Set the WANDB_SILENT environment variable so wandb suppresses its console log output.
# NOTE(review): this is set after `import wandb` above — confirm wandb reads it at init time.
os.environ['WANDB_SILENT'] = 'true'

# Names of the sweep parameters for each model type. The key is the model
# suffix taken from a sweep config file name (last '_'-separated token of the stem).
_COMMON_PARAS = ['transform', 'n_estimators', 'n_jobs', 'learning_rate', 'max_depth']
_CHILD_WEIGHT_PARAS = ['min_child_weight', 'subsample', 'colsample_bytree']
_MIN_SAMPLES_PARAS = ['min_samples_split', 'min_samples_leaf']

# List concatenation builds a fresh list per key, so entries never alias each other.
PARA_DICT = {
    'rf': _COMMON_PARAS + _CHILD_WEIGHT_PARAS,
    'xgb': _COMMON_PARAS + _CHILD_WEIGHT_PARAS,
    'gbm': _COMMON_PARAS + _MIN_SAMPLES_PARAS,
}

In [3]:
# Level passed to fetch_data(); presumably 'cm' means country-month — TODO confirm.
level = 'cm'
# Directory handed to get_config_path() to locate the common/wandb/model/sweep configs.
config_path = Path('./my_config')

In [4]:
# Build one transformed copy of the datasets per transform name, and keep the
# second value returned by transform_data() alongside it under the same key.
transforms = ['raw', 'log', 'normalize', 'standardize']
Datasets_transformed, para_transformed = {}, {}
qslist, Datasets = fetch_data(level)
for t in transforms:
    transformed, transform_paras = transform_data(Datasets, t, by_group=True)
    Datasets_transformed[t] = transformed
    para_transformed[t] = transform_paras

Fetching query sets
Fetching datasets
 .     Transforming datasets
Transforming datasets
Transforming datasets
Transforming datasets


In [5]:
def train():
    """Run a single sweep trial; used as the callback for ``wandb.agent``.

    Reads the module-level globals ``common_config``, ``wandb_config``,
    ``model_config``, ``sweep_paras``, ``Datasets_transformed`` and
    ``para_transformed``, so those must be defined before the agent starts.

    The W&B run is always finished, even when training or evaluation raises;
    in that case it is finished with a non-zero exit code and the exception
    is re-raised to the caller.
    """
    run = wandb.init(config=common_config, project=wandb_config['project'], entity=wandb_config['entity'])
    exit_code = 1  # assume failure until every step below has completed
    try:
        wandb.config.update(model_config, allow_val_change=True)

        # Name the run after the swept parameters and their values for this trial,
        # e.g. "transform_log_n_estimators_100_...".
        wandb.run.name = '_'.join(f'{para}_{run.config[para]}' for para in sweep_paras)

        retrain_transformed_sweep(Datasets_transformed, sweep_paras)
        evaluate('calib', para_transformed, by_group=True)
        exit_code = 0
    finally:
        # Close the run on every path so a failed trial does not leave it open.
        run.finish(exit_code=exit_code)

In [12]:
# Resolve the four config locations from the root config directory.
# model_config_path and sweep_config_path are used as directories by the sweep loop.
common_config_path, wandb_config_path, model_config_path, sweep_config_path = get_config_path(config_path)
# Settings passed to wandb.init(config=...) for every run.
common_config = get_config_from_path(common_config_path, 'common')
# Provides the W&B 'project' and 'entity' used for sweeps and runs.
wandb_config = get_config_from_path(wandb_config_path, 'wandb')

In [13]:
# Launch a W&B sweep for the first supported sweep config in `sweep_config_path`.
# Each sweep config must have a model config with the same file name in
# `model_config_path`. Entries are visited in sorted order so the config that
# gets picked does not depend on filesystem ordering.
for sweep_file in sorted(sweep_config_path.iterdir()):
    if not sweep_file.is_file():
        continue  # skip sub-directories instead of stopping the loop on them

    model_file = model_config_path / sweep_file.name
    if not model_file.is_file():
        raise FileNotFoundError(f'The corresponding model configuration file {model_file} does not exist.')

    # File stems are expected to end in "..._<model>" or "..._hurdle_<model>".
    name_parts = sweep_file.stem.split('_')
    if len(name_parts) >= 2 and name_parts[-2] == 'hurdle':
        continue # Currently Hurdle models are not supported

    model = name_parts[-1]
    if model not in PARA_DICT:
        raise KeyError(f'Unknown model type {model!r} in sweep file {sweep_file}; expected one of {sorted(PARA_DICT)}.')

    # Load configs only for files that will actually be swept.
    # sweep_config, model_config and sweep_paras are read as globals by train().
    sweep_config = get_config_from_path(sweep_file, 'sweep')
    model_config = get_config_from_path(model_file, 'model')
    sweep_paras = PARA_DICT[model]

    sweep_id = wandb.sweep(sweep_config, project=wandb_config['project'],
                           entity=wandb_config['entity'])
    wandb.agent(sweep_id, function=train)

    # NOTE(review): the original stopped after the first directory entry; presumably
    # only one sweep is meant to run per execution. Remove this break to run them all.
    break

Create sweep with ID: 0wap8h48
Sweep URL: https://wandb.ai/model-development-and-deployment/training_example_4/sweeps/0wap8h48
transform_log_n_estimators_100_n_jobs_12_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5
{'n_estimators': 100, 'n_jobs': 12, 'learning_rate': 0.05, 'max_depth': 12, 'min_child_weight': 12, 'subsample': 0.5, 'colsample_bytree': 0.5}
Training model fatalities003_nl_baseline_rf
Calibration partition (log)
 * == Performing a run: "fatalities003_nl_baseline_rf_calib_transform_log_n_estimators_100_n_jobs_12_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5" == * 
Training model(s)...
Storing "fatalities003_nl_baseline_rf_calib_transform_log_n_estimators_100_n_jobs_12_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5"
Getting predictions
pr_56_cm_fatalities003_nl_baseline_rf_calib_transform_log_n_estimators_100_n_jobs_12_learning_rate_0.05_max_depth_12_min_