
# ViEWS 3 constituent models 
## ViEWS production system, cm level


This notebook trains a set of regression models for use in the monthly updated ViEWS ensemble for predicting fatalities.

The notebook does the following: 
1. Retrieves data through querysets and stores it in `Datasets`, a list of dictionaries
2. Specifies the metadata of a number of models and stores it in `ModelList`, a list of dictionaries
3. Trains the models in `ModelList` and stores the trained objects in model storage and the predictions in prediction storage
4. Saves part of `ModelList` as csv and the rest as pickles

## Importing modules

In [1]:
# Load IPython's autoreload extension and reload all modules before each cell runs,
# so edits to imported helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:

# Basics
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier

from lightgbm import LGBMClassifier, LGBMRegressor

# Views 3
from viewser.operations import fetch
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
from views_runs import storage
from views_runs.storage import store, retrieve, fetch_metadata

from views_forecasts.extensions import *

# Other packages
import pickle as pkl

# Packages from viewsforecasting repository

#from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated
import os
import sys
sys.path.append('../')
sys.path.append('../Tools')
sys.path.append('../Intermediates')
from FetchData import FetchData, RetrieveFromList, document_queryset, ReturnQsList, document_ensemble
from ViewsEstimators import *


In [3]:
# Print the installed scikit-learn version so the run records which version was used.
import sklearn
print(sklearn.__version__)

1.3.2


## Common parameters

In [4]:
# Common parameters shared by the rest of the notebook:
# run identifiers, forecasting steps, partition definitions and the Dropbox path.
dev_id = 'Fatalities002'
run_id = dev_id

# Generating a new run if necessary

#try:
#    ViewsMetadata().new_run(name=run_id,description='Developing the fatalities model for FCDO',min_month=1,max_month=999)
#except KeyError:
#    if 'devel' not in run_id:
#        warnings.warn('You are overwriting a production system')

RerunQuerysets = True

EndOfHistory = 517
steps = [*range(1, 36+1, 1)] # Which steps to train and predict for
fi_steps = [1,3,6,12,36] # Which steps to present feature importances for
#steps = [1,3,6,12,36]
#fi_steps = [1,3,6,12,36]

# Specifying partitions: each dictionary gives the (first, last) time index of the
# "train" and "predict" timespans of one partition.
calib_partitioner_dict = {"train":(121,408),"predict":(409,456)}
test_partitioner_dict = {"train":(121,456),"predict":(457,504)}
future_partitioner_dict = {"train":(121,504),"predict":(505,516)}
calib_partitioner =  views_runs.DataPartitioner({"calib":calib_partitioner_dict})
test_partitioner =  views_runs.DataPartitioner({"test":test_partitioner_dict})
future_partitioner =  views_runs.DataPartitioner({"future":future_partitioner_dict})

# Locate the shared Dropbox folder below the current user's home directory.
# The home directory is used rather than os.getlogin(): getlogin() reports the login
# name of the controlling terminal (it can differ from the account that owns the
# files, and it raises OSError when the kernel has no controlling terminal).
Mydropbox = f'{os.path.expanduser("~")}/Dropbox (ViEWS)/ViEWS'
print('Setting Mydropbox to',Mydropbox)

Setting Mydropbox to /Users/root/Dropbox (ViEWS)/ViEWS


# Retrieve data

In [5]:
# Shell escape: show which `python` executable is first on the shell's PATH.
! which python

/Users/sofia/mambaforge/envs/viewser_vimur/bin/python


In [6]:
# Create Markdown documentation of all querysets used
level = 'cm'  # level identifier handed to ReturnQsList
qslist = ReturnQsList(level)  # queryset list for this level (helper imported from FetchData)
document_queryset(qslist,dev_id)  # documents every queryset in qslist under the dev_id label

32: "Queryset fatalities002_topics *transform in progress* - 22 of 452 jobs remaining"       

100%|██████████| 46.0M/46.0M [00:18<00:00, 2.50MB/s]


Queryset fatalities002_topics read successfully                                        
2: "Queryset fatalities002_aquastat *dispatched to transform queue* - columns to compute: 11"  

100%|██████████| 2.37M/2.37M [00:00<00:00, 3.19MB/s]


Queryset fatalities002_aquastat read successfully                                             
21: "Queryset fatalities002_conflict_history *transform in progress* - 26 of 178 jobs remaining"       

100%|██████████| 4.41M/4.41M [00:01<00:00, 2.99MB/s]


Queryset fatalities002_conflict_history read successfully                                        
32: "Queryset fatalities002_conflict_history_long *transform in progress* - 86 of 364 jobs remaining"      

100%|██████████| 8.47M/8.47M [00:04<00:00, 1.81MB/s]


Queryset fatalities002_conflict_history_long read successfully                                        
13: "Queryset fatalities002_vdem_short *transform in progress* - 65 of 331 jobs remaining"       

100%|██████████| 4.01M/4.01M [00:01<00:00, 2.71MB/s]


Queryset fatalities002_vdem_short read successfully                                        
22: "Queryset fatalities002_wdi_short *transform in progress* - 18 of 180 jobs remaining"       

100%|██████████| 5.18M/5.18M [00:01<00:00, 3.34MB/s]


Queryset fatalities002_wdi_short read successfully                                        
2: "Queryset fatalities002_joint_broad *dispatched to transform queue* - columns to compute: 7"  

100%|██████████| 13.9M/13.9M [00:04<00:00, 2.84MB/s]


Queryset fatalities002_joint_broad read successfully                                            
3: "Queryset fatalities002_faostat *dispatched to transform queue* - columns to compute: 35"  

100%|██████████| 3.59M/3.59M [00:01<00:00, 2.22MB/s]


Queryset fatalities002_faostat read successfully                                             
2: "Queryset fatalities002_faoprices *dispatched to transform queue* - columns to compute: 10"  

100%|██████████| 2.03M/2.03M [00:01<00:00, 1.86MB/s]


Queryset fatalities002_faoprices read successfully                                             
2: "Queryset fatalities002_imfweo *dispatched to transform queue* - columns to compute: 4"  

100%|██████████| 1.42M/1.42M [00:00<00:00, 3.42MB/s]


Queryset fatalities002_imfweo read successfully                                            
Model:  fatalities002_baseline
Model:  fatalities002_topics
Model:  fatalities002_aquastat
Model:  fatalities002_conflict_history
Model:  fatalities002_conflict_history_long
Model:  fatalities002_vdem_short
Model:  fatalities002_wdi_short
Model:  fatalities002_all_features
Model:  fatalities002_joint_narrow
Model:  fatalities002_joint_broad
Model:  fatalities002_faostat
Model:  fatalities002_faoprices
Model:  fatalities002_imfweo


In [7]:
# Fetch the data for every queryset in qslist, using EndOfHistory as the cut-off argument.
from FetchData import fetch_cm_data_from_model_def

# Later cells iterate Datasets as a list of dictionaries with 'Name' and 'df' entries.
Datasets=fetch_cm_data_from_model_def(qslist, EndOfHistory)

100%|██████████| 1.42M/1.42M [00:00<00:00, 1.98MB/s]


Queryset fatalities002_imfweo read successfully 
imfweo: A dataset with 11 columns, with data between t = 1 and 852; 213 units.
2: "Queryset fatalities002_joint_narrow *dispatched to transform queue* - columns to compute: 2"  

100%|██████████| 3.81M/3.81M [00:01<00:00, 1.95MB/s]


Queryset fatalities002_joint_narrow read successfully                                            
joint_narrow: A dataset with 31 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 8.47M/8.47M [00:02<00:00, 3.41MB/s]


Queryset fatalities002_conflict_history_long read successfully 
conflictlong_ln: A dataset with 62 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 2.03M/2.03M [00:00<00:00, 2.32MB/s]


Queryset fatalities002_faoprices read successfully 
faoprices: A dataset with 17 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 1.28M/1.28M [00:00<00:00, 2.37MB/s]


Queryset fatalities002_baseline read successfully 
baseline002: A dataset with 6 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 2.37M/2.37M [00:01<00:00, 1.94MB/s]


Queryset fatalities002_aquastat read successfully 
aquastat: A dataset with 17 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 13.9M/13.9M [00:06<00:00, 2.21MB/s]


Queryset fatalities002_joint_broad read successfully 
joint_broad: A dataset with 80 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 4.41M/4.41M [00:01<00:00, 3.53MB/s]


Queryset fatalities002_conflict_history read successfully 
conflict_ln: A dataset with 30 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 56.5M/56.5M [00:19<00:00, 2.83MB/s]


Queryset fatalities002_all_features read successfully 
all_features: A dataset with 189 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 3.59M/3.59M [00:01<00:00, 2.24MB/s]


Queryset fatalities002_faostat read successfully 
faostat: A dataset with 41 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 5.18M/5.18M [00:02<00:00, 2.57MB/s]


Queryset fatalities002_wdi_short read successfully 
wdi_short: A dataset with 34 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 4.01M/4.01M [00:01<00:00, 2.01MB/s]


Queryset fatalities002_vdem_short read successfully 
vdem_short: A dataset with 64 columns, with data between t = 1 and 852; 213 units.


100%|██████████| 46.0M/46.0M [00:16<00:00, 2.77MB/s]


Queryset fatalities002_topics read successfully 
topics_002: A dataset with 70 columns, with data between t = 1 and 852; 213 units.
all_features [9.99999702e-01 2.97648790e-07 2.71474294e-13 5.36004218e-16
 5.98000121e-17 4.93573610e-17 1.26320806e-17 9.90132683e-18
 2.68556820e-18 5.72507731e-19 1.72781911e-19 3.75338064e-20
 1.80545456e-20 1.00420723e-20 4.16086375e-21 1.56095069e-21
 1.11980632e-21 9.61684717e-22 1.30640993e-23 1.13025407e-23]
[2.87777559e+16 1.57003398e+13 1.49941326e+10 6.66255939e+08
 2.22539765e+08 2.02177562e+08 1.02280875e+08 9.05531762e+07
 4.71601397e+07 2.17744737e+07 1.19620641e+07 5.57530069e+06
 3.86678719e+06 2.88382340e+06 1.85630245e+06 1.13697675e+06
 9.63004788e+05 8.92428327e+05 1.04015180e+05 9.67486756e+04]
topics [9.99999781e-01 1.82814384e-07 3.23193459e-08 2.74300212e-09
 1.01924676e-09 1.96394121e-16 8.29242922e-17 1.69048090e-17
 5.71992410e-18 4.57968849e-18]
[3.56143189e+10 1.52275443e+07 6.40259413e+06 1.86525306e+06
 1.13701020e+06 4.991

# Generating predictions
This section uses the ViEWS3 partitioning/stepshifting syntax. Models are trained for (A) the calibration partition and (B) the test partition, so that some calibration routines can be tested. Most models are trained with ln_ged_sb_best as the outcome.

In [8]:
# Show the column names of every dataset whose name contains 'topics'.
for ds in Datasets:
    if 'topics' not in ds['Name']:
        continue  # not a topics dataset, nothing to show
    print(ds['df'].columns)
    

Index(['ln_ged_sb_dep', 'ln_ged_sb', 'wdi_sp_pop_totl', 'topic_tokens_t1',
       'topic_tokens_t2', 'topic_tokens_t13', 'topic_ste_theta0_stock_t1',
       'topic_ste_theta0_stock_t2', 'topic_ste_theta0_stock_t13',
       'topic_ste_theta1_stock_t1', 'topic_ste_theta1_stock_t2',
       'topic_ste_theta1_stock_t13', 'topic_ste_theta2_stock_t1',
       'topic_ste_theta2_stock_t2', 'topic_ste_theta2_stock_t13',
       'topic_ste_theta3_stock_t1', 'topic_ste_theta3_stock_t2',
       'topic_ste_theta3_stock_t13', 'topic_ste_theta4_stock_t1',
       'topic_ste_theta4_stock_t2', 'topic_ste_theta4_stock_t13',
       'topic_ste_theta5_stock_t1', 'topic_ste_theta5_stock_t2',
       'topic_ste_theta5_stock_t13', 'topic_ste_theta6_stock_t1',
       'topic_ste_theta6_stock_t2', 'topic_ste_theta6_stock_t13',
       'topic_ste_theta7_stock_t1', 'topic_ste_theta7_stock_t2',
       'topic_ste_theta7_stock_t13', 'topic_ste_theta8_stock_t1',
       'topic_ste_theta8_stock_t2', 'topic_ste_theta8_stock_t1

## Checking missingness and infinity values

In [9]:
# For the first N columns of every dataset, report the number of rows, the number of
# missing values and the number of infinite values in that column.
N = 51  # only the first N columns of each dataset are reported
for dataset in Datasets:
    df = dataset['df']
    print(dataset['Name'])
    for col in df.columns[:N]:
        column = df[col]
        # Count infinities in this column only; scanning the whole frame here would
        # repeat one frame-wide total on every line and hide which column is affected.
        print(col, len(column), 'missing:', column.isnull().sum(), 'infinity:', np.isinf(column).sum())


imfweo
imfweo_ngdp_rpch_tcurrent 158230 missing: 0 infinity: 0
imfweo_ngdp_rpch_tmin1 158230 missing: 0 infinity: 0
imfweo_ngdp_rpch_tplus1 158230 missing: 0 infinity: 0
imfweo_ngdp_rpch_tplus2 158230 missing: 0 infinity: 0
ln_ged_sb_dep 158230 missing: 0 infinity: 0
ln_ged_sb 158230 missing: 0 infinity: 0
gleditsch_ward 158230 missing: 0 infinity: 0
wdi_sp_pop_totl 158230 missing: 0 infinity: 0
decay_ged_sb_5 158230 missing: 0 infinity: 0
decay_ged_os_5 158230 missing: 0 infinity: 0
splag_1_decay_ged_sb_5 158230 missing: 0 infinity: 0
joint_narrow
ln_ged_sb_dep 158230 missing: 0 infinity: 0
gleditsch_ward 158230 missing: 0 infinity: 0
ln_ged_sb 158230 missing: 0 infinity: 0
reign_tenure_months 158230 missing: 0 infinity: 0
wdi_sp_pop_totl 158230 missing: 2242 infinity: 0
wdi_ag_lnd_frst_k2 158230 missing: 2620 infinity: 0
wdi_nv_agr_totl_kn 158230 missing: 5053 infinity: 0
wdi_sh_sta_maln_zs 158230 missing: 30885 infinity: 0
wdi_sl_tlf_totl_fe_zs 158230 missing: 11586 infinity: 0
wdi_

# Specify models in ensemble

In [10]:
# Build the list of model specifications for the 'cm' level and list them.
from ModelDefinitions import DefineEnsembleModels

# Each entry is a dictionary; the keys read in this notebook include
# 'modelname', 'algorithm', 'depvar', 'data_train' and 'queryset'.
ModelList = DefineEnsembleModels('cm')
    
# Print position, model name and the name of the training dataset of every model.
for imodel,model in enumerate(ModelList):
    print(imodel, model['modelname'], model['data_train'])

0 fatalities002_baseline_rf baseline002
1 fatalities002_conflicthistory_rf conflict_ln
2 fatalities002_conflicthistory_gbm conflict_ln
3 fatalities002_conflicthistory_hurdle_lgb conflict_ln
4 fatalities002_conflicthistory_long_xgb conflictlong_ln
5 fatalities002_vdem_hurdle_xgb vdem_short
6 fatalities002_wdi_rf wdi_short
7 fatalities002_topics_rf topics_002
8 fatalities002_topics_xgb topics_002
9 fatalities002_topics_hurdle_lgb topics_002
10 fatalities002_joint_broad_rf joint_broad
11 fatalities002_joint_broad_hurdle_rf joint_broad
12 fatalities002_joint_narrow_xgb joint_narrow
13 fatalities002_joint_narrow_hurdle_xgb joint_narrow
14 fatalities002_joint_narrow_hurdle_lgb joint_narrow
15 fatalities002_all_pca3_xgb all_features
16 fatalities002_aquastat_rf aquastat
17 fatalities002_faostat_rf faostat
18 fatalities002_faoprices_rf faoprices
19 fatalities002_imfweo_rf imfweo
20 fatalities002_Markov_glm joint_narrow
21 fatalities002_Markov_rf joint_narrow


In [11]:
# Display the full model specification list (one dictionary per model).
ModelList

[{'modelname': 'fatalities002_baseline_rf',
  'algorithm': XGBRFRegressor(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bytree=None, device=None,
                 early_stopping_rounds=None, enable_categorical=False,
                 eval_metric=None, feature_types=None, gamma=None,
                 grow_policy=None, importance_type=None,
                 interaction_constraints=None, max_bin=None,
                 max_cat_threshold=None, max_cat_to_onehot=None,
                 max_delta_step=None, max_depth=None, max_leaves=None,
                 min_child_weight=None, missing=nan, monotone_constraints=None,
                 multi_strategy=None, n_estimators=300, n_jobs=12,
                 num_parallel_tree=None, objective='reg:squarederror',
                 random_state=None, reg_alpha=None, ...),
  'depvar': 'ln_ged_sb_dep',
  'data_train': 'baseline002',
  'queryset': 'fatalities002_baseline',
  'preprocessing': 'float_it',
  '

In [12]:
# Write documentation for the ensemble's models (helper imported from FetchData).
# NOTE(review): 'sb' presumably selects the state-based outcome - confirm against document_ensemble.
document_ensemble(ModelList,'sb')

0 fatalities002_baseline_rf baseline002
1 fatalities002_conflicthistory_rf conflict_ln
2 fatalities002_conflicthistory_gbm conflict_ln
3 fatalities002_conflicthistory_hurdle_lgb conflict_ln
4 fatalities002_conflicthistory_long_xgb conflictlong_ln
5 fatalities002_vdem_hurdle_xgb vdem_short
6 fatalities002_wdi_rf wdi_short
7 fatalities002_topics_rf topics_002
8 fatalities002_topics_xgb topics_002
9 fatalities002_topics_hurdle_lgb topics_002
10 fatalities002_joint_broad_rf joint_broad
11 fatalities002_joint_broad_hurdle_rf joint_broad
12 fatalities002_joint_narrow_xgb joint_narrow
13 fatalities002_joint_narrow_hurdle_xgb joint_narrow
14 fatalities002_joint_narrow_hurdle_lgb joint_narrow
15 fatalities002_all_pca3_xgb all_features
16 fatalities002_aquastat_rf aquastat
17 fatalities002_faostat_rf faostat
18 fatalities002_faoprices_rf faoprices
19 fatalities002_imfweo_rf imfweo
20 fatalities002_Markov_glm joint_narrow
21 fatalities002_Markov_rf joint_narrow


In [13]:
# Report the per-column count of missing values for each dataset, then replace the
# dataset's frame with a copy in which every missing value is set to 0.
for ds in Datasets:
    df = ds['df']
    na_counts = df.isna().sum()
    print(ds['Name'], na_counts)
    ds['df'] = df.fillna(0)

imfweo imfweo_ngdp_rpch_tcurrent    0
imfweo_ngdp_rpch_tmin1       0
imfweo_ngdp_rpch_tplus1      0
imfweo_ngdp_rpch_tplus2      0
ln_ged_sb_dep                0
ln_ged_sb                    0
gleditsch_ward               0
wdi_sp_pop_totl              0
decay_ged_sb_5               0
decay_ged_os_5               0
splag_1_decay_ged_sb_5       0
dtype: int64
joint_narrow ln_ged_sb_dep                      0
gleditsch_ward                     0
ln_ged_sb                          0
reign_tenure_months                0
wdi_sp_pop_totl                 2242
wdi_ag_lnd_frst_k2              2620
wdi_nv_agr_totl_kn              5053
wdi_sh_sta_maln_zs             30885
wdi_sl_tlf_totl_fe_zs          11586
wdi_sm_pop_refg_or              4664
wdi_sp_dyn_imrt_in              2242
wdi_sp_pop_14_fe_zs             2242
wdi_sp_pop_grow                 2242
vdem_v2xcl_dmove               15145
vdem_v2xcl_rol                 15145
vdem_v2xeg_eqdr                15145
vdem_v2xpe_exlpol              154

In [14]:
# Train (or retrieve from model storage) every model in ModelList and store its
# predictions under run_id for the calibration and test partitions, plus the true
# future when includeFuture is set.
#   force_retrain: passed as `retrain` to RunResult.retrain_or_retrieve; for the Markov
#                  models it forces the predictions to be recomputed.
#   force_rewrite: recompute and overwrite the stored predictions of the stepshifted
#                  models instead of first trying to read them back.
# To do: set the data_preprocessing to the function in the model dictionary

# Imported explicitly so the timestamps below do not depend on a star-import.
from datetime import datetime

from views_runs import Storage, StepshiftedModels
from views_partitioning.data_partitioner import DataPartitioner
from viewser import Queryset, Column
from views_runs import operations
from views_runs.run_result import RunResult
from new_markov import markov

level = 'cm'
includeFuture = False
force_rewrite = True
force_retrain = True


def _store_predictions(predictions, store_name, overwrite):
    """Tag `predictions` with run_id, write them to prediction storage and return them."""
    predictions.forecasts.set_run(run_id)
    if overwrite:
        predictions.forecasts.to_store(name=store_name, overwrite=True)
    else:
        # Leave `overwrite` at the library default, as the non-forced path always did.
        predictions.forecasts.to_store(name=store_name)
    return predictions


def _train_or_retrieve(model, partition_name, partitioner_dict, storage_suffix):
    """Return the RunResult of `model` for the "train" timespan of one partition.

    The model object is stored as model['modelname'] + storage_suffix.
    """
    return RunResult.retrain_or_retrieve(
        retrain            = force_retrain,
        store              = storage.Storage(),
        partitioner        = DataPartitioner({partition_name: partitioner_dict}),
        stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
        dataset            = RetrieveFromList(Datasets, model['data_train']),
        queryset_name      = model['queryset'],
        partition_name     = partition_name,
        timespan_name      = "train",
        storage_name       = model['modelname'] + storage_suffix,
        author_name        = "JED",
    )


def _predict_or_retrieve(store_name, make_predictions):
    """Return the predictions of a stepshifted model stored under `store_name`.

    With force_rewrite the predictions are recomputed and overwritten. Otherwise they
    are read from prediction storage, and computed and stored only if reading raises
    KeyError. `make_predictions` is a zero-argument callable that is invoked only when
    predictions have to be computed.
    """
    if force_rewrite:
        print(store_name, ', run', run_id, 'force_rewrite=True, predicting')
        return _store_predictions(make_predictions(), store_name, overwrite=True)
    print('Trying to retrieve predictions', datetime.now())
    try:
        return pd.DataFrame.forecasts.read_store(run=run_id, name=store_name)
    except KeyError:
        print(store_name, ', run', run_id, 'does not exist, predicting')
        return _store_predictions(make_predictions(), store_name, overwrite=False)


def _markov_predict_or_retrieve(model, partitioner_dict, mode, store_name, retrain_note):
    """Return Markov-model predictions stored under `store_name`.

    With force_retrain the predictions are recomputed with markov.compute_markov and
    overwritten. Otherwise they are read from prediction storage and computed only if
    reading raises KeyError. `retrain_note` is the text printed when recomputing.
    """
    def compute():
        return markov.compute_markov(partitioner_dict, EndOfHistory, model['depvar'], mode, model['algorithm'])

    if force_retrain:
        print(store_name, ', run', run_id, retrain_note)
        return _store_predictions(compute(), store_name, overwrite=True)
    try:
        return pd.DataFrame.forecasts.read_store(run=run_id, name=store_name)
    except KeyError:
        print(store_name, ', run', run_id, 'does not exist, predicting')
        return _store_predictions(compute(), store_name, overwrite=True)


for i, model in enumerate(ModelList):
    ct = datetime.now()
    print(i, model['modelname'])
    print('Calibration partition', ct)
    model['Algorithm_text'] = str(model['algorithm'])

    if 'Markov' not in model['modelname']:
        # Stepshifted models: train per partition, then predict that partition's "predict" timespan.
        model['RunResult_calib'] = _train_or_retrieve(model, "calib", calib_partitioner_dict, '_calib')
        predictions_calib = _predict_or_retrieve(
            model['predstore_calib'],
            lambda: model['RunResult_calib'].run.predict("calib", "predict", model['RunResult_calib'].data),
        )

        print('Test partition', datetime.now())
        model['RunResult_test'] = _train_or_retrieve(model, "test", test_partitioner_dict, '_test')
        predictions_test = _predict_or_retrieve(
            model['predstore_test'],
            lambda: model['RunResult_test'].run.predict("test", "predict", model['RunResult_test'].data),
        )

        # Predictions for true future
        if includeFuture:
            print('Future', datetime.now())
            # The future model is registered under the partition label "test" but uses
            # future_partitioner_dict and its own storage name.
            model['RunResult_future'] = _train_or_retrieve(model, "test", future_partitioner_dict, '_future')
            # Set the store name before it is used in either branch; it used to be
            # assigned only on the retrieve path, after the rewrite path had already read it.
            model['predstore_future'] = level +  '_' + model['modelname'] + '_f' + str(EndOfHistory)
            # NOTE(review): predict() is called with (partition, timespan, data) for calib/test;
            # confirm that this two-argument form is the intended call for forecasting from EndOfHistory.
            predictions_future = _predict_or_retrieve(
                model['predstore_future'],
                lambda: model['RunResult_future'].run.predict(EndOfHistory, model['RunResult_future'].data),
            )

    else:
        # Markov models are computed by markov.compute_markov rather than through RunResult.
        print('Trying to retrieve predictions', ct)
        predictions_calib = _markov_predict_or_retrieve(
            model, calib_partitioner_dict, 'calib', model['predstore_calib'],
            'force_retrain = True, predicting',
        )

        print('Test partition', datetime.now())
        predictions_test = _markov_predict_or_retrieve(
            model, test_partitioner_dict, 'test', model['predstore_test'],
            'force_retrain=True, predicting',
        )

        if includeFuture:
            ct = datetime.now()
            print('Future', ct)
            print('Trying to retrieve predictions', ct)
            model['predstore_future'] = level +  '_' + model['modelname'] + '_f' + str(EndOfHistory)
            # NOTE(review): the 'future' mode is given test_partitioner_dict, not
            # future_partitioner_dict - confirm this is what compute_markov expects.
            predictions_future = _markov_predict_or_retrieve(
                model, test_partitioner_dict, 'future', model['predstore_future'],
                'force_retrain=True, predicting',
            )

        print('**************************************************************')

print('All done')

0 fatalities002_baseline_rf
Calibration partition 2024-03-28 14:38:39.265577
 * == Performing a run: "fatalities002_baseline_rf_calib" == * 
Model object named "fatalities002_baseline_rf_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_baseline_rf_calib"
Training model(s)...
Storing "fatalities002_baseline_rf_calib"
cm_fatalities002_baseline_rf_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 14:39:35.819056
 * == Performing a run: "fatalities002_baseline_rf_test" == * 
Model object named "fatalities002_baseline_rf_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_baseline_rf_test"
Training model(s)...
Storing "fatalities002_baseline_rf_test"
cm_fatalities002_baseline_rf_test , run Fatalities002 force_rewrite=True, predicting
1 fatalities002_conflicthistory_rf
Calibration partition 2024-03-28 14:40:35.909058
 * == Performing a run: "fatalities002_conflicthistory_rf_cali

Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_conflicthistory_rf_calib"
cm_fatalities002_conflicthistory_rf_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 14:41:46.165731
 * == Performing a run: "fatalities002_conflicthistory_rf_test" == * 
Model object named "fatalities002_conflicthistory_rf_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_conflicthistory_rf_test"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_conflicthistory_rf_test"
cm_fatalities002_conflicthistory_rf_test , run Fatalities002 force_rewrite=True, predicting
2 fatalities002_conflicthistory_gbm
Calibration partition 2024-03-28 14:42:59.023648
 * == Performing a run: "fatalities002_conflicthistory_gbm_calib" == * 
Model object named "fatalities002_conflicthistory_gbm_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_conflicthistory_gbm_calib"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_conflicthistory_gbm_calib"
cm_fatalities002_conflicthistory_gbm_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 14:58:36.495449
 * == Performing a run: "fatalities002_conflicthistory_gbm_test" == * 
Model object named "fatalities002_conflicthistory_gbm_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_conflicthistory_gbm_test"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_conflicthistory_gbm_test"
cm_fatalities002_conflicthistory_gbm_test , run Fatalities002 force_rewrite=True, predicting
3 fatalities002_conflicthistory_hurdle_lgb
Calibration partition 2024-03-28 15:18:01.900232
 * == Performing a run: "fatalities002_conflicthistory_hurdle_lgb_calib" == * 
Model object named "fatalities002_conflicthistory_hurdle_lgb_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_conflicthistory_hurdle_lgb_calib"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


[LightGBM] [Info] Number of positive: 5545, number of negative: 41061
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7025
[LightGBM] [Info] Number of data points in the train set: 46606, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118976 -> initscore=-2.002162
[LightGBM] [Info] Start training from score -2.002162
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6260
[LightGBM] [Info] Number of data points in the train set: 5545, number of used features: 29
[LightGBM] [Info] Start training from score 3.391455
[LightGBM] [Info] Number of positive: 5519, number of negative: 40881
[LightGBM] [Info] Auto-choosing 

Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


[LightGBM] [Info] Number of positive: 6659, number of negative: 49108
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002804 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7074
[LightGBM] [Info] Number of data points in the train set: 55767, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119408 -> initscore=-1.998053
[LightGBM] [Info] Start training from score -1.998053
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6479
[LightGBM] [Info] Number of data points in the train set: 6659, number of used features: 29
[LightGBM] [Info] Start training from score 3.416581
[LightGBM] [Info] Number of positive: 6631, number of negative: 48928
[LightGBM] [Info] Auto-choosing 

Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_conflicthistory_long_xgb_calib"
cm_fatalities002_conflicthistory_long_xgb_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 15:20:39.725886
 * == Performing a run: "fatalities002_conflicthistory_long_xgb_test" == * 
Model object named "fatalities002_conflicthistory_long_xgb_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_conflicthistory_long_xgb_test"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_conflicthistory_long_xgb_test"
cm_fatalities002_conflicthistory_long_xgb_test , run Fatalities002 force_rewrite=True, predicting
5 fatalities002_vdem_hurdle_xgb
Calibration partition 2024-03-28 15:21:26.507006
 * == Performing a run: "fatalities002_vdem_hurdle_xgb_calib" == * 
Model object named "fatalities002_vdem_hurdle_xgb_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_vdem_hurdle_xgb_calib"
Training model(s)...
Storing "fatalities002_vdem_hurdle_xgb_calib"
cm_fatalities002_vdem_hurdle_xgb_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 15:22:31.248758
 * == Performing a run: "fatalities002_vdem_hurdle_xgb_test" == * 
Model object named "fatalities002_vdem_hurdle_xgb_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_vdem_hurdle_xgb_test"
Training model(s)...
Storing "fatalities002_vdem_hurdle_xgb_test"
cm_fatalities002_vdem_hurdle_xgb_test ,

Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_joint_broad_rf_calib"
cm_fatalities002_joint_broad_rf_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 15:39:09.439201
 * == Performing a run: "fatalities002_joint_broad_rf_test" == * 
Model object named "fatalities002_joint_broad_rf_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_joint_broad_rf_test"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_joint_broad_rf_test"
cm_fatalities002_joint_broad_rf_test , run Fatalities002 force_rewrite=True, predicting
11 fatalities002_joint_broad_hurdle_rf
Calibration partition 2024-03-28 15:41:23.943835
 * == Performing a run: "fatalities002_joint_broad_hurdle_rf_calib" == * 
Model object named "fatalities002_joint_broad_hurdle_rf_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_joint_broad_hurdle_rf_calib"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_joint_broad_hurdle_rf_calib"
cm_fatalities002_joint_broad_hurdle_rf_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 15:44:19.458694
 * == Performing a run: "fatalities002_joint_broad_hurdle_rf_test" == * 
Model object named "fatalities002_joint_broad_hurdle_rf_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_joint_broad_hurdle_rf_test"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_joint_broad_hurdle_rf_test"
cm_fatalities002_joint_broad_hurdle_rf_test , run Fatalities002 force_rewrite=True, predicting
12 fatalities002_joint_narrow_xgb
Calibration partition 2024-03-28 15:47:29.871456
 * == Performing a run: "fatalities002_joint_narrow_xgb_calib" == * 
Model object named "fatalities002_joint_narrow_xgb_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_joint_narrow_xgb_calib"
Training model(s)...
Storing "fatalities002_joint_narrow_xgb_calib"
cm_fatalities002_joint_narrow_xgb_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 15:48:43.107608
 * == Performing a run: "fatalities002_joint_narrow_xgb_test" == * 
Model object named "fatalities002_joint_narrow_xgb_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_joint_narrow_xgb_test"
Training model(s)...
Storing "fatalities002_joint_narrow_xgb_test"
cm_fatalities002_joint_narrow_xgb_

Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_all_pca3_xgb_calib"
cm_fatalities002_all_pca3_xgb_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 15:55:52.503601
 * == Performing a run: "fatalities002_all_pca3_xgb_test" == * 
Model object named "fatalities002_all_pca3_xgb_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_all_pca3_xgb_test"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_all_pca3_xgb_test"
cm_fatalities002_all_pca3_xgb_test , run Fatalities002 force_rewrite=True, predicting
16 fatalities002_aquastat_rf
Calibration partition 2024-03-28 15:58:08.022987
 * == Performing a run: "fatalities002_aquastat_rf_calib" == * 
Model object named "fatalities002_aquastat_rf_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_aquastat_rf_calib"
Training model(s)...
Storing "fatalities002_aquastat_rf_calib"
cm_fatalities002_aquastat_rf_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 15:59:17.204642
 * == Performing a run: "fatalities002_aquastat_rf_test" == * 
Model object named "fatalities002_aquastat_rf_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_aquastat_rf_test"
Training model(s)...
Storing "fatalities002_aquastat_rf_test"
cm_fatalities002_aquastat_rf_test , run Fatalities002 force_rewrite=True, predicting
17 fatalities002_

Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_faoprices_rf_calib"
cm_fatalities002_faoprices_rf_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 16:04:39.156789
 * == Performing a run: "fatalities002_faoprices_rf_test" == * 
Model object named "fatalities002_faoprices_rf_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_faoprices_rf_test"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_faoprices_rf_test"
cm_fatalities002_faoprices_rf_test , run Fatalities002 force_rewrite=True, predicting
19 fatalities002_imfweo_rf
Calibration partition 2024-03-28 16:05:44.344149
 * == Performing a run: "fatalities002_imfweo_rf_calib" == * 
Model object named "fatalities002_imfweo_rf_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_imfweo_rf_calib"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_imfweo_rf_calib"
cm_fatalities002_imfweo_rf_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 16:06:40.382123
 * == Performing a run: "fatalities002_imfweo_rf_test" == * 
Model object named "fatalities002_imfweo_rf_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_imfweo_rf_test"
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_imfweo_rf_test"
cm_fatalities002_imfweo_rf_test , run Fatalities002 force_rewrite=True, predicting
20 fatalities002_Markov_glm
Calibration partition 2024-03-28 16:07:40.448323
Trying to retrieve predictions 2024-03-28 16:07:40.448323
cm_fatalities002_Markov_glm_calib , run Fatalities002 force_retrain = True, predicting


100%|██████████| 3.81M/3.81M [00:01<00:00, 2.18MB/s]


Queryset fatalities002_joint_narrow read successfully 


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.0
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract


Attaching package: ‘arrow’

The following object is masked from ‘package:magrittr’:

    is_in

The following object is masked from ‘package:lubridate’:

    duration

The following object is masked from ‘package:utils’:

    timestamp




 All required packages installed 

 Packages loaded, starting script 





 Rscript finished! 
Test partition 2024-03-28 16:18:43.238126
cm_fatalities002_Markov_glm_test , run Fatalities002 force_retrain=True, predicting


100%|██████████| 3.81M/3.81M [00:01<00:00, 2.11MB/s]


Queryset fatalities002_joint_narrow read successfully 


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.0
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract


Attaching package: ‘arrow’

The following object is masked from ‘package:magrittr’:

    is_in

The following object is masked from ‘package:lubridate’:

    duration

The following object is masked from ‘package:utils’:

    timestamp




 All required packages installed 

 Packages loaded, starting script 





 Rscript finished! 
**************************************************************
21 fatalities002_Markov_rf
Calibration partition 2024-03-28 16:31:23.332600
Trying to retrieve predictions 2024-03-28 16:31:23.332600
cm_fatalities002_Markov_rf_calib , run Fatalities002 force_retrain = True, predicting


100%|██████████| 3.81M/3.81M [00:01<00:00, 1.98MB/s]


Queryset fatalities002_joint_narrow read successfully 


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.0
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract


Attaching package: ‘arrow’

The following object is masked from ‘package:magrittr’:

    is_in

The following object is masked from ‘package:lubridate’:

    duration

The following object is masked from ‘package:utils’:

    timestamp




 All required packages installed 

 Packages loaded, starting script 





 Rscript finished! 
Test partition 2024-03-28 16:36:07.426222
cm_fatalities002_Markov_rf_test , run Fatalities002 force_retrain=True, predicting


100%|██████████| 3.81M/3.81M [00:01<00:00, 3.61MB/s]


Queryset fatalities002_joint_narrow read successfully 


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.0
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract


Attaching package: ‘arrow’

The following object is masked from ‘package:magrittr’:

    is_in

The following object is masked from ‘package:lubridate’:

    duration

The following object is masked from ‘package:utils’:

    timestamp




 All required packages installed 

 Packages loaded, starting script 





 Rscript finished! 
**************************************************************
All done


## Notes on training time for the various algorithms:

In [15]:
#Approximate training times, in minutes, for the hh20 feature set (about 40 features) over all 36 steps, listed separately for the calibration (c) and test (t) partitions; they include the time spent generating predictions:

#nj=12 (number of threads)
#scikit random forest:        21:13 (c), 26:20 (t) RandomForestRegressor(n_estimators=200, n_jobs=nj)
#XGB random forest:           06:02 (c), 07:51 (t) XGBRFRegressor(n_estimators=300,n_jobs=nj)
#scikit gbm:                  13:59 (c), 15:55 (t) GradientBoostingRegressor()
#scikit hurdle random forest: 07:32 (c), 09:49 (t) For both clf and reg: (n_estimators=200, n_jobs=nj)
#XGB hurdle xgb:              01:26 (c), 01:32 (t) For both clf and reg: (n_estimators=200,tree_method='hist',n_jobs=nj)
#scikit histgbm:              01:17 (c), 01:20 (t) HistGradientBoostingRegressor(max_iter=200)
#XGB xgb:                     01:00 (c), 01:04 (t) XGBRegressor(n_estimators=200,tree_method='hist',n_jobs=nj)
#lightgbm gbm:                00:25 (c), --    (t) LGBMRegressor(n_estimators=100,num_threads=8)

# Various helper functions and tools....

# Retrieving external forecasts

In [16]:
## Retrieve David's Markov models
## To do: rewrite the model dictionary to the new, slimmer version.
#DRList = []


#model = {
#    'modelname':   'fatalities002_Markov_glm',
#    'algorithm': [],
#    'depvar': "ln_ged_sb_dep",
#    'data_train':      'joint_narrow',
#    'queryset': 'fatalities002_joint_narrow',
#}
#DRList.append(model)

#model = {
#    'modelname':   'fatalities002_Markov_rf',
#    'algorithm': [],
#    'depvar': "ln_ged_sb_dep",
#    'data_train':      'joint_narrow',
#    'queryset': 'fatalities002_joint_narrow',
#}

#DRList.append(model)



In [17]:
#path = f'/Users/{os.getlogin()}/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/'

#DRList[0]['predictions_file_calib'] = path + 'markov_jointnarrow_ss_glm_calib.parquet'
#DRList[0]['predictions_file_test'] = path + 'markov_jointnarrow_ss_glm_test.parquet'
#DRList[0]['predictions_file_future'] = path + 'vmm_glm_hh20_517.csv'

#DRList[1]['predictions_file_calib'] = path + 'markov_jointnarrow_ss_rf_calib.parquet'
#DRList[1]['predictions_file_test'] = path + 'markov_jointnarrow_ss_rf_test.parquet'
#DRList[1]['predictions_file_future'] = path + 'vmm_rf_hh20_517.csv'

In [18]:
## Storing Markov models in central storage
## Retrieving dependent variable

#print('Adding depvar - CHECK FILES BEING USED FROM STORAGE ARE SUITABLE!')
#target_calib = pd.DataFrame.forecasts.read_store('cm_fatalities002_conflicthistory_rf_calib', run=run_id)['ln_ged_sb_dep']
#target_test = pd.DataFrame.forecasts.read_store('cm_fatalities002_conflicthistory_rf_test', run=run_id)['ln_ged_sb_dep']
#level = 'cm'
#for model in DRList:
#    df_calib = pd.read_parquet(model['predictions_file_calib'])
##    df_calib.rename(columns={'target_month_id':'month_id'}, inplace=True)
##    df_calib.set_index(['month_id', 'country_id'], inplace=True)

#    df_test = pd.read_parquet(model['predictions_file_test'])
##    df_test.rename(columns={'target_month_id':'month_id'}, inplace=True)
##    df_test.set_index(['month_id', 'country_id'], inplace=True)

##    df_future = pd.read_csv(model['predictions_file_future'],index_col=['month_id','country_id'])
#    df_calib['ln_ged_sb_dep'] = target_calib
#    df_test['ln_ged_sb_dep'] = target_test
##    df_future['ln_ged_sb_dep'] = np.nan # Empty dependent variable column for consistency/required by prediction storage function
#    stored_modelname = level + '_' + model['modelname'] + '_calib'
#    df_calib.forecasts.set_run(run_id)
#    df_calib.forecasts.to_store(name=stored_modelname, overwrite=True)
#    stored_modelname = level + '_' + model['modelname'] + '_test'
#    df_test.forecasts.set_run(run_id)
#    df_test.forecasts.to_store(name=stored_modelname, overwrite=True)    