In [1]:
import os
import warnings
from pathlib import Path

# Silence warnings before the heavier third-party imports below are executed.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import wandb

from views_forecasts.extensions import *
from utils import fetch_data, transform_data, get_config_path, get_config_from_path, retrain_transformed_sweep, evaluate
from utils_map import plot_cm_map, plot_pgm_map

os.environ['WANDB_SILENT'] = 'true'


In [2]:
# Directory that holds the configuration files read by get_config_path() below.
config_path = Path('./my_config_cm')
# Level-of-analysis tag passed to fetch_data() / transform_data().
level = 'cm'

In [3]:
# Fetch the query sets and datasets for the selected level once ...
qslist, Datasets = fetch_data(level)

# ... then build one transformed copy per transform, keeping the fitted
# parameters returned alongside it so predictions can be mapped back later.
transforms = ['raw', 'log', 'normalize', 'standardize']
Datasets_transformed, para_transformed = {}, {}
for transform_name in transforms:
    transformed, fitted_paras = transform_data(Datasets, transform_name, level, by_group=True)
    Datasets_transformed[transform_name] = transformed
    para_transformed[transform_name] = fitted_paras

Fetching query sets
 .     Fetching datasets
 .     

In [16]:
def train():
    """Run a single trial of a W&B sweep: retrain the models and evaluate them.

    Meant to be passed as ``function`` to ``wandb.agent``. The trial's
    hyper-parameters are read from the run config; every key except
    ``'transform'`` is treated as a model parameter and appended to the
    run name.

    Relies on notebook-level globals that must be defined before the agent
    starts: ``common_config``, ``model_config``, ``Datasets_transformed`` and
    ``para_transformed``.
    """
    # Using the run as a context manager guarantees it is finished even when
    # retraining or evaluation raises, so a failed trial does not leave an
    # open run behind for the next trial of the sweep.
    with wandb.init() as run:
        # Snapshot the swept parameters *before* the common/model configs are
        # merged in, so only sweep parameters end up in the run name.
        model_paras = [para for para in run.config.keys() if para != 'transform']
        run_name = f'transform_{run.config["transform"]}'
        for para in model_paras:
            run_name += f'_{para}_{run.config[para]}'
        run.name = run_name

        run.config.update(common_config, allow_val_change=True)
        run.config.update(model_config, allow_val_change=True)

        retrain_transformed_sweep(Datasets_transformed, model_paras)
        evaluate('calib', para_transformed, by_group=True)

In [15]:
# Resolve the individual configuration locations under config_path.
config_locations = get_config_path(config_path)
common_config_path, wandb_config_path, model_config_path, sweep_config_path = config_locations

# Load the W&B settings (project/entity) and the settings shared by all models.
wandb_config = get_config_from_path(wandb_config_path, 'wandb')
common_config = get_config_from_path(common_config_path, 'common')

In [12]:
# Run a W&B sweep for the first supported (non-hurdle) sweep configuration in
# sweep_config_path. Entries are visited in sorted order so that the selected
# configuration does not depend on the filesystem's listing order.
for sweep_file in sorted(sweep_config_path.iterdir()):
    if not sweep_file.is_file():
        # Skip sub-directories; previously a non-file entry hit the trailing
        # `break` and silently ended the search before any sweep was run.
        continue

    model_file = model_config_path / sweep_file.name
    if not model_file.is_file():
        raise FileNotFoundError(f'The corresponding model configuration file {model_file} does not exist.')

    # model_config is read as a global by train(), so it must be set before
    # the agent is started.
    sweep_config = get_config_from_path(sweep_file, 'sweep')
    model_config = get_config_from_path(model_file, 'model')

    stem_parts = sweep_file.stem.split('_')
    if len(stem_parts) >= 2 and stem_parts[-2] == 'hurdle':
        continue # Currently Hurdle models are not supported
    model = stem_parts[-1]
    sweep_id = wandb.sweep(sweep_config, project=wandb_config['project'],
                           entity=wandb_config['entity'])
    wandb.agent(sweep_id, function=train)
    print(f'Finish sweeping over model {sweep_file.stem}')
    break  # only one configuration is swept per execution of this cell

Create sweep with ID: 7wx39ouu
Sweep URL: https://wandb.ai/model-development-and-deployment/test001/sweeps/7wx39ouu
['learning_rate', 'n_estimators', 'n_jobs', 'transform']
['learning_rate', 'n_estimators', 'n_jobs', 'transform']
Finish sweeping over model fatalities003_pgm_baseline_lgbm


In [11]:
# Identifier of the stored run and of the prediction set to inspect.
run_id = 'Fatalities003'
name = 'cm_fatalities003_nl_baseline_rf_calib_transform_raw_n_estimators_100_n_jobs_15_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5'

# Columns to keep: the 'ged_sb_dep' column followed by one prediction column
# per step, for steps 1 through 36.
steps = list(range(1, 36 + 1))
stepcols = ['ged_sb_dep'] + ['step_pred_' + str(step) for step in steps]

In [14]:
# Predictions stored for the 'raw' (untransformed) cm baseline model.
name = 'cm_fatalities003_nl_baseline_rf_calib_transform_raw_n_estimators_100_n_jobs_15_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5'
stored_predictions = pd.DataFrame.forecasts.read_store(run=run_id, name=name)
# Infinite values are replaced by 0 before selecting the columns of interest.
df = stored_predictions.replace([np.inf, -np.inf], 0)[stepcols]
df.describe()

pr_56_cm_fatalities003_nl_baseline_rf_calib_transform_raw_n_estimators_100_n_jobs_15_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5.parquet


Unnamed: 0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
count,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,...,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0
mean,43.984293,2.763453,2.719436,2.763168,1.894036,1.928245,1.827149,1.621647,1.83769,1.649706,...,1.242454,1.237369,1.164332,1.202674,1.216925,1.16641,1.115325,1.138484,1.148242,1.224143
std,478.269169,3.672684,3.542995,3.772312,2.644334,2.367929,2.027407,1.869207,2.093655,1.758595,...,1.432982,1.447862,1.432327,1.397935,1.329776,1.451369,1.308437,1.410775,1.367916,1.252939
min,0.0,0.480216,0.480745,0.482173,0.490586,0.491545,0.485932,0.496379,0.491325,0.493941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.748876,0.772803,0.729018,0.711627,0.79151,0.739163,0.752162,0.738416,0.753256,...,0.650071,0.647873,0.608584,0.632843,0.64985,0.600785,0.527729,0.576196,0.598334,0.699598
50%,0.0,2.634222,2.419861,2.716885,1.371664,1.39656,1.437981,1.095765,1.448651,1.228742,...,0.745319,0.72931,0.657638,0.716428,0.750603,0.643519,0.587826,0.602538,0.630868,0.793501
75%,0.0,3.265092,3.173449,3.235476,1.622635,1.757818,1.741029,1.335304,1.620372,1.421709,...,0.967016,0.993814,0.858438,0.96792,0.985358,0.851137,0.967359,0.89546,0.911281,1.062088
max,19000.0,46.281792,47.852463,49.803467,33.822624,28.254168,28.297398,17.863476,23.64448,18.066242,...,20.86784,22.295361,28.512068,24.516321,16.908724,20.12236,16.604708,29.344843,22.985306,15.077058


In [22]:
# Predictions stored for the 'log' transformed cm baseline model.
name = 'cm_fatalities003_nl_baseline_rf_calib_transform_log_n_estimators_100_n_jobs_15_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5'
# Infinite values are replaced by 0 before selecting the columns of interest.
df = pd.DataFrame.forecasts.read_store(run=run_id, name=name).replace([np.inf, -np.inf], 0)[stepcols]
# Map every selected column back with exp(x) - 1. np.expm1 computes the same
# quantity without the cancellation error that exp(x) - 1 suffers for values
# close to zero.
# NOTE(review): this assumes the forward transform was log(x + 1) — confirm
# against transform_data.
df = np.expm1(df)
df

pr_56_cm_fatalities003_nl_baseline_rf_calib_transform_log_n_estimators_100_n_jobs_15_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.0,0.613933,0.616953,0.615754,0.614390,0.616320,0.616750,0.614932,0.620469,0.617564,...,0.617548,0.616562,0.615784,0.617495,0.619100,0.617310,0.613890,0.617663,0.614962,0.620047
397,2,0.0,0.628708,0.627454,0.626578,0.623116,0.627264,0.625510,0.623309,0.636764,0.630672,...,0.621717,0.615270,0.616696,0.617528,0.622300,0.619750,0.613505,0.619197,0.620475,0.622134
397,3,0.0,0.620750,0.618255,0.618701,0.618201,0.619136,0.617050,0.617961,0.624933,0.621965,...,0.616367,0.612105,0.616021,0.612847,0.615287,0.614563,0.611961,0.614759,0.615695,0.617963
397,4,0.0,0.623216,0.613123,0.613084,0.613172,0.614843,0.615930,0.614272,0.615191,0.614701,...,0.624097,0.625719,0.633003,0.623588,0.631281,0.633249,0.636581,0.634476,0.641765,0.637501
397,5,0.0,0.628965,0.627593,0.626561,0.623811,0.628646,0.624390,0.624241,0.636951,0.631354,...,0.622985,0.615514,0.616201,0.617878,0.622994,0.619938,0.613623,0.619270,0.621060,0.622389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.0,0.627109,0.635177,0.629947,0.625829,0.630717,0.627193,0.628726,0.621645,0.630045,...,0.640423,0.655568,0.629049,0.635070,0.639029,0.633139,0.637537,0.638622,0.649304,0.653889
444,243,0.0,0.646165,0.636574,0.633543,0.626536,0.643574,0.631771,0.638303,0.638083,0.628973,...,0.641926,0.659257,0.626271,0.643949,0.646233,0.638871,0.646016,0.635770,0.646017,0.653868
444,244,0.0,0.614472,0.614351,0.612793,0.612464,0.612587,0.613666,0.612700,0.614742,0.613690,...,0.620062,0.620757,0.621573,0.620810,0.616627,0.620747,0.615645,0.619771,0.616820,0.616574
444,245,0.0,0.694161,0.798478,0.900139,0.834527,0.846576,0.877188,0.880774,0.885611,0.882882,...,0.781598,0.763405,0.765723,0.832082,0.844601,0.834185,0.850778,0.833401,0.790267,0.765193


In [17]:
# Inspect the parameters returned by transform_data for the 'standardize'
# transform, entry 'baseline003'. These are the values used by the
# back-transformation cell that follows.
para_transformed['standardize']['baseline003']

Unnamed: 0_level_0,mean_val,std_val
country_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.000000,0.000000
2,0.000000,0.000000
3,0.045775,1.096525
4,0.191315,3.660668
5,0.000000,0.000000
...,...,...
248,0.000000,
250,0.000000,
252,0.000000,
253,1.000000,


In [20]:
def standardize_retransform(x, mean_val, std_val):
    """Undo a standardization: scale ``x`` by ``std_val`` and shift by ``mean_val``.

    Works on anything supporting ``*`` and ``+`` (scalars, arrays, Series, ...).

    Args:
        x: Standardized value(s).
        mean_val: Mean that was subtracted by the forward transform.
        std_val: Standard deviation the forward transform divided by.

    Returns:
        ``x * std_val + mean_val``.
    """
    rescaled = x * std_val
    return rescaled + mean_val
# Predictions stored for the 'standardize' transformed cm baseline model.
name = 'cm_fatalities003_nl_baseline_rf_calib_transform_standardize_n_estimators_100_n_jobs_15_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5'
df_para_model = para_transformed['standardize']['baseline003']
# Infinite values are replaced by 0 before selecting the columns of interest.
df = pd.DataFrame.forecasts.read_store(run=run_id, name=name).replace([np.inf, -np.inf], 0)[stepcols]
# Look up the per-group parameters for all rows at once instead of doing two
# .loc lookups per row inside df.apply(axis=1). The group id is the second
# index level; .loc raises KeyError for ids missing from df_para_model, just
# like the per-row lookup did.
group_ids = df.index.get_level_values(1)
mean_by_row = df_para_model['mean_val'].loc[group_ids].to_numpy()[:, None]
std_by_row = df_para_model['std_val'].loc[group_ids].to_numpy()[:, None]
# The (n_rows, 1) column vectors broadcast across all selected columns.
df = standardize_retransform(df, mean_by_row, std_by_row)
df

pr_56_cm_fatalities003_nl_baseline_rf_calib_transform_standardize_n_estimators_100_n_jobs_15_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
397,2,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
397,3,0.0,0.568336,0.568134,0.568391,0.566591,0.566438,0.566815,0.566233,0.567021,0.568354,...,0.566616,0.566050,0.565800,0.567109,0.566194,0.566755,0.565418,0.565348,0.565413,0.565428
397,4,0.0,1.916907,1.916631,1.907969,1.906660,1.917911,1.909212,1.909986,1.912638,1.906618,...,1.916026,1.918890,1.932348,1.915606,1.918482,1.927866,1.924985,1.921892,1.930486,1.924796
397,5,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.0,0.544947,0.548019,0.551224,0.547349,0.547731,0.547068,0.548666,0.548416,0.547005,...,0.554154,0.553195,0.550166,0.550887,0.552065,0.550052,0.549318,0.556122,0.554823,0.553363
444,243,0.0,3.179936,3.166709,3.163791,3.158991,3.160678,3.169139,3.157281,3.165691,3.173427,...,3.174238,3.182089,3.155188,3.166767,3.164034,3.156653,3.177890,3.168548,3.178139,3.168908
444,244,0.0,0.287449,0.287058,0.287274,0.287270,0.287344,0.287443,0.287834,0.288015,0.287467,...,0.288000,0.287278,0.287829,0.287174,0.287844,0.287389,0.289995,0.287656,0.287030,0.287577
444,245,0.0,169.287914,171.018399,172.103673,170.782257,170.767298,170.756183,170.928340,170.894070,170.555161,...,169.847792,169.446313,169.744989,170.488995,170.782842,170.696890,170.838378,170.295155,169.780164,169.350488


In [21]:
def normalize_retransform(x, min_val, max_val, b=1, a=0):
    """Undo a min-max normalization from the interval ``[a, b]``.

    Args:
        x: Normalized value(s).
        min_val: Minimum of the original data.
        max_val: Maximum of the original data.
        b: Upper bound of the normalized interval (default 1).
        a: Lower bound of the normalized interval (default 0).

    Returns:
        ``(x - a) / (b - a) * (max_val - min_val) + min_val``.
    """
    unit_position = (x - a) / (b - a)
    original_span = max_val - min_val
    return unit_position * original_span + min_val
# Predictions stored for the 'normalize' transformed cm baseline model.
name = 'cm_fatalities003_nl_baseline_rf_calib_transform_normalize_n_estimators_100_n_jobs_15_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5'
df_para_model = para_transformed['normalize']['baseline003']
# Infinite values are replaced by 0 before selecting the columns of interest.
df = pd.DataFrame.forecasts.read_store(run=run_id, name=name).replace([np.inf, -np.inf], 0)[stepcols]
# Look up the per-group parameters for all rows at once instead of doing two
# .loc lookups per row inside df.apply(axis=1). The group id is the second
# index level; .loc raises KeyError for ids missing from df_para_model, just
# like the per-row lookup did.
group_ids = df.index.get_level_values(1)
min_by_row = df_para_model['min_val'].loc[group_ids].to_numpy()[:, None]
max_by_row = df_para_model['max_val'].loc[group_ids].to_numpy()[:, None]
# The (n_rows, 1) column vectors broadcast across all selected columns.
df = normalize_retransform(df, min_by_row, max_by_row)
df


pr_56_cm_fatalities003_nl_baseline_rf_calib_transform_normalize_n_estimators_100_n_jobs_15_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
397,2,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
397,3,0.0,14.746427,14.736396,14.736063,14.731823,14.731092,14.732048,14.728331,14.731538,14.732114,...,14.729359,14.728168,14.728981,14.729588,14.728914,14.730579,14.727331,14.727019,14.726715,14.727497
397,4,0.0,44.668164,44.661583,44.659625,44.662051,44.671018,44.675338,44.661264,44.658174,44.672436,...,44.673901,44.694856,44.690306,44.684804,44.699713,44.702128,44.710939,44.715611,44.709955,44.697604
397,5,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.0,11.884318,11.887568,11.892138,11.880030,11.885156,11.880813,11.884962,11.896341,11.889368,...,11.896967,11.899932,11.880910,11.889268,11.886983,11.885408,11.883394,11.887187,11.899642,11.893121
444,243,0.0,82.752975,82.744921,82.723121,82.693921,82.734260,82.716271,82.747084,82.766997,82.727093,...,82.729873,82.769180,82.717168,82.746119,82.770429,82.716753,82.732606,82.718293,82.798629,82.751222
444,244,0.0,5.701192,5.700891,5.701339,5.701179,5.701680,5.700988,5.701654,5.701597,5.701646,...,5.701873,5.701480,5.701562,5.701800,5.701632,5.701535,5.702786,5.702155,5.701668,5.702101
444,245,0.0,283.673317,284.429160,285.377931,284.684418,284.766448,284.755489,284.891159,284.817942,284.527486,...,283.939055,283.795121,283.869650,284.467958,284.630210,284.633880,284.569441,284.349471,284.030341,283.822163


In [113]:
# Identifier of the stored run and of the prediction set to inspect
# (here the 'raw' cm topics model).
run_id = 'Fatalities003'
name = 'cm_fatalities003_nl_topics_rf_calib_transform_raw_n_estimators_250_n_jobs_12_learning_rate_0.05_max_depth_12_min_child_weight_12_subsample_0.5_colsample_bytree_0.5'

# Columns to keep: 'ged_sb_dep' followed by the prediction columns for steps 1..36.
steps = list(range(1, 36 + 1))
stepcols = ['ged_sb_dep'] + ['step_pred_' + str(step) for step in steps]

stored_predictions = pd.DataFrame.forecasts.read_store(run=run_id, name=name)
# Infinite values are replaced by 0 before selecting the columns of interest.
df = stored_predictions.replace([np.inf, -np.inf], 0)[stepcols]
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.0,1.888033,1.536575,2.049761,1.194238,1.080280,1.230281,1.012267,0.979016,1.124231,...,0.605951,0.616939,0.591293,0.696502,0.823638,0.557715,0.525643,0.541731,0.555350,0.671805
397,2,0.0,3.313491,2.442976,3.095902,1.791524,1.584468,1.561841,1.353563,1.191040,1.440145,...,0.726233,0.770196,0.647228,0.721609,0.706091,0.572721,0.510407,0.583434,0.579779,0.788106
397,3,0.0,1.746300,1.800268,1.744825,1.028135,1.015283,1.214884,0.834816,0.719306,0.796419,...,1.161782,1.024357,0.661682,0.779543,0.571230,0.505648,0.748100,0.514143,0.524082,0.612821
397,4,0.0,0.487359,0.500316,0.511034,0.510376,0.575729,0.510639,0.576125,0.510816,0.504026,...,0.665250,0.683974,0.645012,0.484309,1.083846,1.449021,1.139949,0.482186,0.479546,0.479713
397,5,0.0,3.312054,2.441470,3.093398,1.789339,1.576807,1.559876,1.353323,1.190878,1.439366,...,0.723306,0.757622,0.646705,0.720935,0.699926,0.572176,0.510458,0.581869,0.579295,0.787506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.0,4.505231,5.732021,4.511056,11.456407,1.927837,1.705806,2.175097,7.634606,8.995864,...,9.225223,3.546402,3.188511,5.113416,3.803534,3.561557,3.248026,2.929720,2.788255,3.098599
444,243,0.0,0.564945,0.647136,0.593240,0.794530,0.965720,0.944105,0.807292,1.222221,1.019295,...,1.075368,1.527027,0.485269,0.516639,0.520443,0.820115,0.501266,0.590600,0.635062,1.199985
444,244,0.0,0.670239,1.084977,0.824759,0.710583,1.426395,1.099138,1.305220,1.311780,0.932088,...,2.428319,3.952767,0.813094,1.311723,0.843666,1.109788,2.646547,1.136765,1.150167,1.101795
444,245,0.0,5.004318,6.232212,8.742626,7.999347,5.930683,6.081397,5.129094,5.645001,6.278614,...,3.930102,2.930245,3.637518,4.189796,4.103630,3.408108,2.996001,3.379579,2.662481,2.405901


In [114]:
# Summary statistics (count, mean, std, quantiles, ...) for the frame currently bound to `df`.
df.describe()

Unnamed: 0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
count,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,...,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0,9168.0
mean,43.984293,3.064313,2.938524,3.024721,2.463671,2.344805,2.241646,2.080354,2.044865,2.113336,...,1.541634,1.638868,1.574594,1.621234,1.630235,1.54555,1.448616,1.550858,1.50365,1.700572
std,478.269169,4.013247,3.939282,3.012343,2.652986,2.608635,2.326226,2.094101,2.217072,2.18277,...,1.607832,1.771869,1.72824,1.659796,1.791969,1.802667,1.751915,1.778156,1.798335,2.024134
min,0.0,0.476367,0.476422,0.476632,0.476452,0.476629,0.476453,0.476443,0.47649,0.476624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.141073,1.428083,1.226387,1.061084,1.03064,1.07464,0.995747,0.931747,0.949853,...,0.71953,0.752138,0.645578,0.720144,0.699926,0.572176,0.51293,0.581869,0.579295,0.768986
50%,0.0,2.967103,2.444391,3.093398,1.789339,1.576807,1.629165,1.357512,1.201662,1.439366,...,0.785038,0.801496,0.758299,0.81774,0.767103,0.69922,0.666647,0.680501,0.691359,0.833531
75%,0.0,3.312054,2.83725,3.34623,2.211347,2.121895,2.07509,1.941592,2.056416,2.010623,...,1.74657,1.930422,1.836017,1.928235,1.854694,1.79206,1.655463,1.842366,1.660575,1.698188
max,19000.0,57.567913,58.776665,34.229748,25.622738,30.41069,36.324074,19.378551,22.624584,20.225647,...,19.464668,20.670267,17.030479,17.005161,19.832689,28.565922,27.920639,24.754251,28.713552,22.110678


In [147]:
# All values of the first index level and all prediction column names.
# NOTE(review): neither is used by the loop below, which plots a hand-picked
# subset; they are kept because other cells may rely on them.
months = df.index.levels[0].tolist()
step_preds = [f'step_pred_{i}' for i in range(1, 37)]

# Using the run as a context manager finishes it even if plotting or logging
# raises, so a failure does not leave an open run behind in the kernel.
with wandb.init(project='test', entity='model-development-and-deployment'):
    for month in [399, 400]:
        for step in ['step_pred_1']:
            fig = plot_cm_map(df, month, step)
            # `step` already reads 'step_pred_N'; the former key
            # 'month_{month}_step_{step}' produced a doubled 'step_step_pred_N'
            # and differed from the key format used for the pgm maps.
            wandb.log({f'month_{month}_{step}': wandb.Image(fig)})

In [2]:
# Transforms that will be applied to the fetched datasets further down.
transforms = ['raw', 'log', 'normalize', 'standardize']
# Switch the level-of-analysis tag to 'pgm' and fetch the matching query sets / datasets.
level = 'pgm'
qslist, Datasets = fetch_data(level)

Fetching query sets
 .     
A dataset with 8 columns, with data between t 1 and 852. (13110 units)
 .     
A dataset with 19 columns, with data between t 1 and 852. (13110 units)
 .     
A dataset with 29 columns, with data between t 1 and 852. (13110 units)
 .     
A dataset with 24 columns, with data between t 1 and 852. (13110 units)
 .     
A dataset with 23 columns, with data between t 1 and 852. (13110 units)
 .     
A dataset with 30 columns, with data between t 1 and 852. (13110 units)
 .     
A dataset with 8 columns, with data between t 1 and 852. (13110 units)
 .     
A dataset with 11 columns, with data between t 1 and 852. (13110 units)
Fetching datasets
 .     

In [23]:
# Build one transformed copy of the datasets per transform and keep the fitted
# parameters returned with it, so predictions can be mapped back later.
Datasets_transformed, para_transformed = {}, {}
for transform_name in transforms:
    transformed, fitted_paras = transform_data(Datasets, transform_name, level, by_group=True)
    Datasets_transformed[transform_name] = transformed
    para_transformed[transform_name] = fitted_paras

In [26]:
# Directory that holds the pgm configuration files.
config_path = Path('./my_config_pgm')

# Resolve the individual configuration locations under config_path.
config_locations = get_config_path(config_path)
common_config_path, wandb_config_path, model_config_path, sweep_config_path = config_locations

# Load the W&B settings (project/entity) and the settings shared by all models.
wandb_config = get_config_from_path(wandb_config_path, 'wandb')
common_config = get_config_from_path(common_config_path, 'common')

In [27]:
# Name (file stem) of the sweep configuration to run; a falsy value disables
# the filter, in which case the first supported configuration is swept.
model_name = 'fatalities003_pgm_baseline_lgbm'
for sweep_file in sorted(sweep_config_path.iterdir()):
    if not sweep_file.is_file():
        # Skip sub-directories; previously a non-file entry hit the trailing
        # `break` and silently ended the search before the requested model
        # was reached.
        continue

    # Skip if a specific model name is provided and it doesn't match the file
    model_name_from_file = sweep_file.stem
    if model_name and model_name != model_name_from_file:
        continue

    model_file = model_config_path / sweep_file.name
    if not model_file.is_file():
        raise FileNotFoundError(f'The corresponding model configuration file {model_file} does not exist.')

    # model_config is read as a global by train(), so it must be set before
    # the agent is started.
    sweep_config = get_config_from_path(sweep_file, 'sweep')
    model_config = get_config_from_path(model_file, 'model')

    stem_parts = sweep_file.stem.split('_')
    if len(stem_parts) >= 2 and stem_parts[-2] == 'hurdle':
        continue # Currently Hurdle models are not supported
    model = stem_parts[-1]
    sweep_id = wandb.sweep(sweep_config, project=wandb_config['project'],
                           entity=wandb_config['entity'])
    wandb.agent(sweep_id, function=train)
    print(f'Finish sweeping over model {sweep_file.stem}')
    break  # only one configuration is swept per execution of this cell
else:
    # Reached only when the loop ended without sweeping anything.
    print(f'No sweep was run for model_name={model_name!r}; check {sweep_config_path}')
    

Create sweep with ID: 40divblj
Sweep URL: https://wandb.ai/model-development-and-deployment/pgm_example_1/sweeps/40divblj
transform_log_learning_rate_0.05_n_estimators_100_n_jobs_12
{'learning_rate': 0.05, 'n_estimators': 100, 'n_jobs': 12}
Training model fatalities003_pgm_baseline_lgbm
Calibration partition (log)
 * == Performing a run: "pgm_fatalities003_pgm_baseline_lgbm_calib_transform_log_learning_rate_0.05_n_estimators_100_n_jobs_12" == * 
Training model(s)...
Storing "pgm_fatalities003_pgm_baseline_lgbm_calib_transform_log_learning_rate_0.05_n_estimators_100_n_jobs_12"
Getting predictions
pr_56_pgm_fatalities003_pgm_baseline_lgbm_calib_transform_log_learning_rate_0.05_n_estimators_100_n_jobs_12.parquet
pgm_fatalities003_pgm_baseline_lgbm_calib_transform_log_learning_rate_0.05_n_estimators_100_n_jobs_12 , run Fatalities003 does not exist, predicting
Test partition (log)
 * == Performing a run: "pgm_fatalities003_pgm_baseline_lgbm_test_transform_log_learning_rate_0.05_n_estimators

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01116900324446356, max=1.0)…

transform_normalize_learning_rate_0.05_n_estimators_100_n_jobs_12
{'learning_rate': 0.05, 'n_estimators': 100, 'n_jobs': 12}
Training model fatalities003_pgm_baseline_lgbm
Calibration partition (normalize)
 * == Performing a run: "pgm_fatalities003_pgm_baseline_lgbm_calib_transform_normalize_learning_rate_0.05_n_estimators_100_n_jobs_12" == * 
Training model(s)...
Storing "pgm_fatalities003_pgm_baseline_lgbm_calib_transform_normalize_learning_rate_0.05_n_estimators_100_n_jobs_12"
Getting predictions
pr_56_pgm_fatalities003_pgm_baseline_lgbm_calib_transform_normalize_learning_rate_0.05_n_estimators_100_n_jobs_12.parquet
pgm_fatalities003_pgm_baseline_lgbm_calib_transform_normalize_learning_rate_0.05_n_estimators_100_n_jobs_12 , run Fatalities003 does not exist, predicting
Finish sweeping over model fatalities003_pgm_baseline_lgbm


In [5]:
# Identifier of the stored run and of the prediction set to inspect
# (here the 'raw' pgm baseline model).
run_id = 'Fatalities003'
name = 'pgm_fatalities003_pgm_baseline_lgbm_calib_transform_raw_learning_rate_0.05_n_estimators_100_n_jobs_12'

# Columns to keep: 'ged_sb_dep' followed by the prediction columns for steps 1..36.
steps = list(range(1, 36 + 1))
stepcols = ['ged_sb_dep'] + ['step_pred_' + str(step) for step in steps]

stored_predictions = pd.DataFrame.forecasts.read_store(run=run_id, name=name)
# Infinite values are replaced by 0 before selecting the columns of interest.
df = stored_predictions.replace([np.inf, -np.inf], 0)[stepcols]
df.describe()

pr_56_pgm_fatalities003_pgm_baseline_lgbm_calib_transform_raw_learning_rate_0.05_n_estimators_100_n_jobs_12.parquet


Unnamed: 0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
count,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,...,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0
mean,0.357143,0.36296,0.235796,0.200281,0.192233,0.207696,0.25336,0.26069,0.217874,0.2099,...,0.085503,0.115182,0.102013,0.067152,0.109287,0.095474,0.082626,0.070612,0.074306,0.075961
std,10.74229,22.604209,9.727253,7.633099,6.278958,7.467056,10.196756,11.820635,7.514221,7.488593,...,3.642885,6.255992,6.586327,3.511985,5.003973,5.118448,3.47279,3.366107,3.60937,3.346155
min,0.0,-369.145596,-239.012891,-124.988114,-323.638512,-198.734169,-123.406325,-115.332386,-135.203092,-101.517689,...,-197.290655,-145.53194,-206.145965,-386.248113,-369.163407,-249.28029,-290.583883,-219.790766,-197.321224,-355.121646
25%,0.0,0.017658,0.019247,0.024417,0.025745,0.021106,0.024519,0.024257,0.025342,0.029282,...,0.00843,0.010251,0.012131,0.008328,0.014634,0.014106,0.013854,0.015003,0.015476,0.017671
50%,0.0,0.017658,0.019247,0.024417,0.025745,0.021106,0.024519,0.024257,0.025342,0.029282,...,0.00843,0.010251,0.012395,0.008328,0.014634,0.014106,0.013854,0.015003,0.015476,0.017671
75%,0.0,0.017658,0.019247,0.024417,0.025745,0.024515,0.024519,0.024257,0.025342,0.029282,...,0.013225,0.030053,0.014238,0.012387,0.024532,0.028845,0.017388,0.022591,0.017477,0.022242
max,1580.0,3393.240804,3724.499363,2443.864943,1968.052512,1593.697184,1491.184652,1583.798864,1334.298983,2667.634904,...,696.297475,2566.18675,2621.295037,1203.775368,1397.033959,2085.250237,879.143073,1096.635328,1179.015076,619.578114


In [6]:
# Columns to keep: 'ged_sb_dep' followed by the prediction columns for steps 1..36.
steps = list(range(1, 36 + 1))
stepcols = ['ged_sb_dep'] + ['step_pred_' + str(step) for step in steps]
# Identifier of the stored run and of the prediction set to inspect
# (here the 'log' transformed pgm baseline model).
run_id = 'Fatalities003'
name = 'pgm_fatalities003_pgm_baseline_lgbm_calib_transform_log_learning_rate_0.05_n_estimators_100_n_jobs_12'
# Infinite values are replaced by 0 before selecting the columns of interest.
df = pd.DataFrame.forecasts.read_store(run=run_id, name=name).replace([np.inf, -np.inf], 0)[stepcols]
# Map every selected column back with exp(x) - 1. np.expm1 computes the same
# quantity without the cancellation error that exp(x) - 1 suffers for values
# close to zero.
# NOTE(review): this assumes the forward transform was log(x + 1) — confirm
# against transform_data.
df = np.expm1(df)
df.describe()

pr_56_pgm_fatalities003_pgm_baseline_lgbm_calib_transform_log_learning_rate_0.05_n_estimators_100_n_jobs_12.parquet


Unnamed: 0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
count,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,...,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0,629280.0
mean,0.357143,0.02541,0.022007,0.022355,0.020502,0.017919,0.017827,0.018912,0.016739,0.016003,...,0.011237,0.010115,0.010272,0.00974,0.010019,0.009432,0.009757,0.009439,0.008788,0.008548
std,10.74229,0.370579,0.320609,0.323944,0.296664,0.244225,0.267098,0.309219,0.237498,0.243263,...,0.203521,0.155079,0.167599,0.141783,0.165464,0.141041,0.173581,0.152443,0.134684,0.115434
min,0.0,-0.075677,-0.042418,-0.069909,-0.026511,-0.139463,-0.036578,-0.061792,-0.052306,-0.070387,...,-0.018436,-0.001362,-0.045369,-0.029317,-0.074989,-0.052877,-0.0179,-0.042569,-0.059149,-0.034537
25%,0.0,0.000629,0.000654,0.000694,0.000698,0.000737,0.000735,0.000778,0.000755,0.000747,...,0.000782,0.000783,0.000716,0.000719,0.000746,0.00075,0.00075,0.000789,0.000731,0.00073
50%,0.0,0.000629,0.000654,0.000694,0.000698,0.000737,0.000735,0.000778,0.000755,0.000795,...,0.001086,0.001145,0.001064,0.001069,0.001074,0.001078,0.001098,0.001127,0.001113,0.001074
75%,0.0,0.000629,0.000654,0.000741,0.000821,0.000807,0.000885,0.000941,0.00093,0.000932,...,0.001426,0.001505,0.001441,0.001555,0.001477,0.001455,0.001437,0.001489,0.001493,0.001542
max,1580.0,36.395334,45.864467,29.982219,36.719954,28.361323,31.178444,29.255667,34.050488,41.508575,...,22.5626,19.041801,17.869934,13.054167,16.279341,13.031511,24.287855,21.757464,23.061903,12.259959


In [7]:
def standardize_retransform(x, mean_val, std_val):
    """Undo a standardization: scale ``x`` by ``std_val`` and shift by ``mean_val``.

    Works on anything supporting ``*`` and ``+`` (scalars, arrays, Series, ...).

    Args:
        x: Standardized value(s).
        mean_val: Mean that was subtracted by the forward transform.
        std_val: Standard deviation the forward transform divided by.

    Returns:
        ``x * std_val + mean_val``.
    """
    rescaled = x * std_val
    return rescaled + mean_val
# Predictions stored for the 'standardize' transformed pgm baseline model.
# NOTE: para_transformed, run_id and stepcols must already exist in the kernel;
# the NameError recorded for this cell came from running it before the cell
# that builds para_transformed.
name = 'pgm_fatalities003_pgm_baseline_lgbm_calib_transform_standardize_learning_rate_0.05_n_estimators_100_n_jobs_12'
df_para_model = para_transformed['standardize']['baseline003']
# Infinite values are replaced by 0 before selecting the columns of interest.
df = pd.DataFrame.forecasts.read_store(run=run_id, name=name).replace([np.inf, -np.inf], 0)[stepcols]
# Look up the per-group parameters for all rows at once instead of doing two
# .loc lookups per row inside df.apply(axis=1). The group id is the second
# index level; .loc raises KeyError for ids missing from df_para_model, just
# like the per-row lookup did.
group_ids = df.index.get_level_values(1)
mean_by_row = df_para_model['mean_val'].loc[group_ids].to_numpy()[:, None]
std_by_row = df_para_model['std_val'].loc[group_ids].to_numpy()[:, None]
# The (n_rows, 1) column vectors broadcast across all selected columns.
df = standardize_retransform(df, mean_by_row, std_by_row)
df

NameError: name 'para_transformed' is not defined

In [4]:
# All values of the first index level (not used by the loop below, which plots
# a single hand-picked month) and all prediction column names.
months = df.index.levels[0].tolist()
step_preds = [f'step_pred_{i}' for i in range(1, 37)]

# Using the run as a context manager finishes it even if plotting or logging
# raises, so a failure does not leave an open run behind in the kernel.
with wandb.init(project='test', entity='model-development-and-deployment'):
    for month in [397]:
        for step in step_preds:
            # One map per prediction step, logged under its own key.
            fig = plot_pgm_map(df, month, step)
            wandb.log({f'month_{month}_{step}': wandb.Image(fig)})