
# ViEWS 3 constituent models 
## ViEWS production system, cm level


This notebook trains a set of regression models for use in the monthly updated ViEWS fatalities-prediction ensemble.

The notebook does the following: 
1. Retrieves data through querysets and stores it in `Datasets`, a list of dictionaries
2. Specifies the metadata of a number of models and stores it in `ModelList`, a list of dictionaries
3. Trains the models in `ModelList` and stores the trained objects in model storage and their predictions in prediction storage
4. Saves part of ModelList as csv and the rest as pickles


## Importing modules

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [37]:
from settings import Mydropbox, username, overleafpath

from data import make_queryset_documentation, get_data

# Basics
import numpy as np
import pandas as pd


from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier

from lightgbm import LGBMClassifier, LGBMRegressor


from views_runs import storage
from views_forecasts.extensions import *


from FetchData import RetrieveFromList, document_queryset, ReturnQsList, document_ensemble
from ViewsEstimators import *


## Check common parameters

In [28]:
# Echo the user-specific settings imported from `settings`, one per line, so the
# reader can verify which account and folders this run is using.
for _label, _value in (
    ('User:', username),
    ('Setting Mydropbox to', Mydropbox),
    ('Overleaf path set to', overleafpath),
):
    print(_label, _value)

User: root
Setting Mydropbox to /Users/root/Dropbox (ViEWS)/ViEWS
Overleaf path set to /Users/root/Dropbox (ViEWS)/Apps/Overleaf/VIEWS documentation Fatalities003/


## Get data

In [35]:
# Document the querysets for the "cm" level; judging by the captured output this
# prints one summary line (name, number of columns, time span, units) per queryset.
make_queryset_documentation(level="cm")

 .    fatalities003_baseline; A dataset with 6 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_baseline_nonlog; A dataset with 6 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_topics_stub; A dataset with 62 columns, with data between t 1 and 852. (213 units)
fatalities003_aquastat_stub; A dataset with 62 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_cm_conflict_history_stub; A dataset with 24 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_cm_conflict_history_ext; A dataset with 33 columns, with data between t = 1 and 852. (213 units)
 .    fatalities003_vdem_short_stub; A dataset with 57 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_wdi_short_stub; A dataset with 26 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_joint_narrow; A dataset with 39 columns, with data between t = 1 and 852. (213 units)
 .    fatalities003_joint_broad_stub; A d

In [38]:
# Fetch the data for every cm-level queryset. The result is indexed by position
# and each entry exposes its DataFrame under the "df" key (see the cells below).
datasets = get_data(level="cm")

# The missingness check and the training loop further down read this list under
# the capitalised name `Datasets`, which is not assigned anywhere else in the
# visible notebook. Bind it here so the notebook works from a fresh kernel.
# NOTE(review): assumes the entries returned by get_data carry the 'Name' and
# 'df' keys those cells expect — confirm against data.get_data.
Datasets = datasets

 .    fatalities003_baseline; A dataset with 6 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_baseline_nonlog; A dataset with 6 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_topics_stub; A dataset with 62 columns, with data between t 1 and 852. (213 units)
fatalities003_aquastat_stub; A dataset with 62 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_cm_conflict_history_stub; A dataset with 24 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_cm_conflict_history_ext; A dataset with 33 columns, with data between t = 1 and 852. (213 units)
 .    fatalities003_vdem_short_stub; A dataset with 57 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_wdi_short_stub; A dataset with 26 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_joint_narrow; A dataset with 39 columns, with data between t = 1 and 852. (213 units)
 .    fatalities003_joint_broad_stub; A d

In [52]:
# Print the definition of the queryset at position `ix`; `ix` is reused by the
# next cell to display the matching dataset.
# NOTE(review): `qslist` is not assigned anywhere in the visible notebook —
# presumably it was produced with ReturnQsList in an earlier session. It must be
# defined before this cell for Restart & Run All to succeed.
ix = 2
print(qslist[ix])

name='fatalities003_conflict_history' loa='country_month' themes=['fatalities003'] description='Predicting ln(fatalities), cm level\n    \n                             Queryset with baseline and first set of conflict history features\n    \n                             ' operations=[[RenameOperation(namespace='trf', name='util.rename', arguments=['ged_sb_dep']), TransformOperation(namespace='trf', name='missing.fill', arguments=[]), DatabaseOperation(namespace='base', name='ged2_cm.ged_sb_best_sum_nokgi', arguments=['values'])], [RenameOperation(namespace='trf', name='util.rename', arguments=['ged_sb']), TransformOperation(namespace='trf', name='missing.fill', arguments=[]), DatabaseOperation(namespace='base', name='ged2_cm.ged_sb_best_sum_nokgi', arguments=['values'])], [RenameOperation(namespace='trf', name='util.rename', arguments=['decay_ged_sb_5']), TransformOperation(namespace='trf', name='missing.replace_na', arguments=[]), TransformOperation(namespace='trf', name='temporal.deca

In [54]:
# Display the DataFrame stored under "df" for the dataset at position `ix`.
datasets[ix]["df"]

Unnamed: 0_level_0,Unnamed: 1_level_0,gleditsch_ward,ged_sb_dep,ged_sb,ln_ged_ns,ln_ged_os,ln_acled_sb,ln_acled_sb_count,ln_acled_os,wdi_sp_pop_totl,wdi_sm_pop_netm,...,topic10_conflict_t1_stock,topic11_diplomacy_t1_stock,topic12_power_t1_stock,topic13_sports_t1_stock,topic14_judiciary_t1_stock,splag_topic2_sanctions_t1_stock,splag_topic10_conflict_t1_stock,splag_topic11_diplomacy_t1_stock,splag_topic12_power_t1_stock,splag_topic14_judiciary_t1_stock
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,110.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,778176.0,-15059.0,...,0.113060,2.74194,2.12884,60.976009,0.96870,1.29592,6.081870,6.47591,21.11825,6.40065
1,2,115.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,375112.0,-7390.0,...,0.000000,0.00000,0.00000,0.000000,0.00000,1.20244,4.663720,5.43013,11.89872,3.41727
1,3,52.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1127852.0,-10709.0,...,3.829780,8.29672,5.60053,0.854730,2.32960,0.00000,0.000000,0.00000,0.00000,0.00000
1,4,101.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,15210443.0,244.0,...,1.531210,3.78772,11.34837,3.090630,3.95208,2.27568,19.162361,12.22174,27.97641,14.04929
1,5,990.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,164905.0,-3270.0,...,0.000000,0.00000,0.00000,0.000000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852,242,510.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,63588334.0,-4865.0,...,27.327141,1.20827,10.34431,0.231970,7.27230,0.00000,0.000000,0.00000,0.00000,0.00000
852,243,600.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,37076584.0,-46242.0,...,10.052680,7.05330,3.76188,1.494370,20.13751,0.00000,0.000000,0.00000,0.00000,0.00000
852,244,435.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,4614974.0,-1494.0,...,2.653550,1.01474,3.45133,2.918570,6.39670,0.00000,0.000000,0.00000,0.00000,0.00000
852,245,625.0,510.0,510.0,0.0,1.386294,0.0,0.0,0.0,45657202.0,-15223.0,...,11.256440,1.63621,2.05500,0.972050,2.53410,0.00000,0.000000,0.00000,0.00000,0.00000


In [47]:
#import pickle

#with open('datasets.pkl', 'wb') as f:
#    pickle.dump(datasets, f)

In [48]:
#with open('datasets.pkl', 'rb') as f:
#    loaded_dict = pickle.load(f)

In [7]:
#FixedFirstSplitRegression(ones_name='LGBMClassifier', zeros_name='LGBMRegressor', ones_indicator = '')

# Generating predictions
The models are trained using the ViEWS3 partitioning/stepshifting syntax, separately for (A) the calibration partition and (B) the test partition, so that calibration routines can be tested. Most models are trained with ln_ged_sb_best as the outcome.

In [13]:
# Interactive look-up of the ModelMetadata signature and docstring; nothing from
# this cell is assigned for use by later cells.
from views_runs import ModelMetadata 
help(ModelMetadata)

Help on class ModelMetadata in module views_schema.models:

class ModelMetadata(pydantic.main.BaseModel)
 |  ModelMetadata(*, author: str, queryset_name: str, train_start: int, train_end: int, steps: Optional[List[int]] = None, training_date: datetime.datetime) -> None
 |  
 |  ModelMetadata
 |  
 |  Data used to organize model objects.
 |  
 |  parameters:
 |      author (str): Name of the user that authored the model object.
 |      queryset_name (str): Name of the queryset used to train the model
 |      train_start (int): Month identifier for training start date
 |      train_start (int): Month identifier for training end date
 |      training_date (datetime.datetime): Timestamp for training date (use datetime.datetime.now())
 |  
 |  example:
 |  
 |      # Instantiate the class with values
 |  
 |      my_metadata = ModelMetadata(
 |          author = "my_name",
 |          queryset_name = "my_queryset",
 |          train_start = 1,
 |          train_end = 300,
 |          steps 

## Checking missingness and infinity values

In [40]:
# For each dataset, print its name and then every one of its first N columns that
# contains missing (NaN) or infinite values, together with the column length and
# the two counts.
N = 51  # number of leading columns inspected per dataset
for i in range(len(Datasets)):
    df = Datasets[i]['df']
    print(Datasets[i]['Name'])
    for col in df.iloc[:, :N].columns:
        n_missing = df[col].isnull().sum()
        # Count infinities in this column only. Summing np.isinf over the whole
        # frame would flag every column as soon as a single infinity exists
        # anywhere, report the frame-wide total as if it were the column's count,
        # and rescan the full frame once per column.
        n_inf = np.isinf(df[col]).sum()
        if n_missing > 0 or n_inf > 0:
            print(col, len(df[col]), 'missing:', n_missing, 'infinity:', n_inf)


joint_narrow
vdem_short
joint_broad
topics_003
all_features
baseline003
conflict_ln
conflictlong_ln
wdi_short
pca_all
pca_topics
pca_vdem
pca_wdi


# Specify models in ensemble

In [15]:
# Build the list of model specifications for the 'cm' level; each entry is a
# dictionary that carries at least 'modelname' and 'data_train'.
from ModelDefinitions import DefineEnsembleModels

ModelList = DefineEnsembleModels('cm')
    
# Overview: position in the list, model name and name of the training dataset.
for imodel,model in enumerate(ModelList):
    print(imodel, model['modelname'], model['data_train'])

0 fatalities003_nl_baseline_rf baseline003
1 fatalities003_nl_conflicthistory_rf conflict_ln
2 fatalities003_nl_conflicthistory_hurdle_lgb conflict_ln
3 fatalities003_nl_conflicthistory_long_xgb conflictlong_ln
4 fatalities003_nl_vdem_hurdle_xgb vdem_short
5 fatalities003_nl_wdi_rf wdi_short
6 fatalities003_nl_topics_rf topics_003
7 fatalities003_nl_topics_xgb topics_003
8 fatalities003_nl_topics_hurdle_lgb topics_003
9 fatalities003_nl_joint_broad_rf joint_broad
10 fatalities003_nl_joint_broad_hurdle_rf joint_broad
11 fatalities003_joint_narrow_xgb joint_narrow
12 fatalities003_nl_joint_narrow_hurdle_xgb joint_narrow
13 fatalities003_nl_joint_narrow_hurdle_lgb joint_narrow
14 fatalities003_nl_all_pca3_xgb all_features


In [16]:
# Display the complete model specifications (algorithm objects included).
ModelList

[{'modelname': 'fatalities003_nl_baseline_rf',
  'algorithm': XGBRFRegressor(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bytree=None,
                 early_stopping_rounds=None, enable_categorical=False,
                 eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
                 grow_policy=None, importance_type=None,
                 interaction_constraints=None, max_bin=None,
                 max_cat_threshold=None, max_cat_to_onehot=None,
                 max_delta_step=None, max_depth=None, max_leaves=None,
                 min_child_weight=None, missing=nan, monotone_constraints=None,
                 n_estimators=300, n_jobs=12, num_parallel_tree=None,
                 objective='reg:squarederror', predictor=None, random_state=None,
                 reg_alpha=None, ...),
  'depvar': 'ged_sb_dep',
  'data_train': 'baseline003',
  'queryset': 'fatalities003_baseline',
  'preprocessing': 'float_it',
  'level

In [17]:
# Build the ensemble documentation table for the 'sb' outcome from ModelList.
outcome = 'sb'
EnsembleMetaData_df = document_ensemble(ModelList,outcome)
# The markdown file is written only when the notebook runs under this specific
# user name; for every other user the table is built but not saved.
if username == 'havardhegre1':
    # overleafpath is used as a prefix, so it must end with a path separator.
    filename = overleafpath + f'Tables/Evaluation/Ensemble_{outcome}.md'
    EnsembleMetaData_df.to_markdown(index=False, buf=filename)

0 fatalities003_nl_baseline_rf baseline003


In [18]:
# Loop that checks whether the model exists, retrains if not,
# and stores the predictions if they have not been stored before for this run.
# To do: set the data_preprocessing to the function in the model dictionary
#
# NOTE(review): `steps`, `run_id`, `calib_partitioner_dict`, `test_partitioner_dict`,
# `future_partitioner_dict`, `FutureStart` and `Datasets` are read here but are not
# assigned anywhere in the visible notebook; they must be defined before this cell
# for a Restart & Run All to succeed.

from datetime import datetime  # not imported explicitly elsewhere in the visible notebook

from views_runs import Storage, StepshiftedModels
from views_partitioning.data_partitioner import DataPartitioner
from viewser import Queryset, Column
from views_runs import operations
from views_runs.run_result import RunResult

level = 'cm'
includeFuture = False
force_retrain = False  # passed as `retrain` to RunResult.retrain_or_retrieve


def _fit_or_fetch(model, modelstore, partition_name, partitioner_dict, storage_suffix):
    """Call RunResult.retrain_or_retrieve for one partition of one model.

    Parameters:
        model: model specification dictionary from ModelList.
        modelstore: storage object handed to retrain_or_retrieve as `store`.
        partition_name: key under which `partitioner_dict` is registered and
            which is also passed as `partition_name`.
        partitioner_dict: partition definition given to DataPartitioner.
        storage_suffix: string appended to the model name to form `storage_name`.

    Returns:
        Whatever RunResult.retrain_or_retrieve returns for this configuration.
    """
    return RunResult.retrain_or_retrieve(
        retrain            = force_retrain,
        store              = modelstore,
        partitioner        = DataPartitioner({partition_name: partitioner_dict}),
        stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
        dataset            = RetrieveFromList(Datasets, model['data_train']),
        queryset_name      = model['queryset'],
        partition_name     = partition_name,
        timespan_name      = "train",
        storage_name       = model['modelname'] + storage_suffix,
        author_name        = "HH",
    )


def _read_or_predict(predstore_name, predict):
    """Return the predictions stored under `predstore_name` for the current run.

    If reading from prediction storage raises KeyError, `predict()` is called to
    generate the predictions, which are then tagged with `run_id`, written to
    storage under `predstore_name`, and returned.
    """
    try:
        return pd.DataFrame.forecasts.read_store(run=run_id, name=predstore_name)
    except KeyError:
        print(predstore_name, ', run', run_id, 'does not exist, predicting')
        predictions = predict()
        predictions.forecasts.set_run(run_id)
        predictions.forecasts.to_store(name=predstore_name)
        return predictions


for i, model in enumerate(ModelList):
    if model['algorithm'] != 'Rscript':
        # Use the prediction-store names from the model specification when present;
        # otherwise fall back to the <level>_<modelname>_<partition> convention so
        # the look-ups below cannot fail on a missing dictionary key.
        model.setdefault('predstore_calib', level + '_' + model['modelname'] + '_calib')
        model.setdefault('predstore_test', level + '_' + model['modelname'] + '_test')

        # --- Calibration partition ---
        modelstore = storage.Storage()
        ct = datetime.now()
        print(i, model['modelname'])
        print('Calibration partition', ct)
        model['Algorithm_text'] = str(model['algorithm'])
        model['RunResult_calib'] = _fit_or_fetch(
            model, modelstore, "calib", calib_partitioner_dict, '_calib'
        )

        ct = datetime.now()
        print('Trying to retrieve predictions', ct)
        predictions_calib = _read_or_predict(
            model['predstore_calib'],
            lambda: model['RunResult_calib'].run.predict(
                "calib", "predict", model['RunResult_calib'].data
            ),
        )

        # --- Test partition ---
        ct = datetime.now()
        print('Test partition', ct)
        modelstore = storage.Storage()
        model['RunResult_test'] = _fit_or_fetch(
            model, modelstore, "test", test_partitioner_dict, '_test'
        )

        ct = datetime.now()
        print('Trying to retrieve predictions', ct)
        predictions_test = _read_or_predict(
            model['predstore_test'],
            lambda: model['RunResult_test'].run.predict(
                "test", "predict", model['RunResult_test'].data
            ),
        )

        # --- Predictions for true future ---
        if includeFuture:
            ct = datetime.now()
            print('Future', ct)
            modelstore = storage.Storage()
            # The future partition definition is registered under the "test" key.
            model['RunResult_future'] = _fit_or_fetch(
                model, modelstore, "test", future_partitioner_dict, '_future'
            )

            ct = datetime.now()
            print('Trying to retrieve predictions', ct)
            model['predstore_future'] = level + '_' + model['modelname'] + '_f' + str(FutureStart)
            predictions_future = _read_or_predict(
                model['predstore_future'],
                lambda: model['RunResult_future'].run.future_point_predict(
                    FutureStart, model['RunResult_future'].data
                ),
            )
        print('**************************************************************')

print('All done')

0 fatalities003_nl_baseline_rf
Calibration partition 2023-11-20 13:49:40.614977
 * == Performing a run: "fatalities003_nl_baseline_rf_calib" == * 


Model object named "fatalities003_nl_baseline_rf_calib" with equivalent metadata already exists.
Fetching "fatalities003_nl_baseline_rf_calib" from storage
  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

  If you are loading a serialized model (like pickle in Python, RDS in R) generate

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to retrieve predictions 2023-11-20 13:50:10.870338
pr_56_cm_fatalities003_nl_conflicthistory_hurdle_lgb_calib.parquet
Test partition 2023-11-20 13:50:15.115849
 * == Performing a run: "fatalities003_nl_conflicthistory_hurdle_lgb_test" == * 
Model object named "fatalities003_nl_conflicthistory_hurdle_lgb_test" with equivalent metadata already exists.
Fetching "fatalities003_nl_conflicthistory_hurdle_lgb_test" from storage


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to retrieve predictions 2023-11-20 13:50:19.442131
pr_56_cm_fatalities003_nl_conflicthistory_hurdle_lgb_test.parquet
**************************************************************
3 fatalities003_nl_conflicthistory_long_xgb
Calibration partition 2023-11-20 13:50:23.571004
 * == Performing a run: "fatalities003_nl_conflicthistory_long_xgb_calib" == * 
Model object named "fatalities003_nl_conflicthistory_long_xgb_calib" with equivalent metadata already exists.
Fetching "fatalities003_nl_conflicthistory_long_xgb_calib" from storage
  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

  If you are loading a serialized model (like pickle in Python, RDS in R) gen

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to retrieve predictions 2023-11-20 13:51:28.844398
pr_56_cm_fatalities003_nl_topics_hurdle_lgb_calib.parquet
Test partition 2023-11-20 13:51:33.532504
 * == Performing a run: "fatalities003_nl_topics_hurdle_lgb_test" == * 
Model object named "fatalities003_nl_topics_hurdle_lgb_test" with equivalent metadata already exists.
Fetching "fatalities003_nl_topics_hurdle_lgb_test" from storage


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to retrieve predictions 2023-11-20 13:51:37.802310
pr_56_cm_fatalities003_nl_topics_hurdle_lgb_test.parquet
**************************************************************
9 fatalities003_nl_joint_broad_rf
Calibration partition 2023-11-20 13:51:42.430009
 * == Performing a run: "fatalities003_nl_joint_broad_rf_calib" == * 
Model object named "fatalities003_nl_joint_broad_rf_calib" with equivalent metadata already exists.
Fetching "fatalities003_nl_joint_broad_rf_calib" from storage
  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the mode

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to retrieve predictions 2023-11-20 13:52:42.567302
pr_56_cm_fatalities003_nl_joint_narrow_hurdle_lgb_calib.parquet
Test partition 2023-11-20 13:52:46.972420
 * == Performing a run: "fatalities003_nl_joint_narrow_hurdle_lgb_test" == * 
Model object named "fatalities003_nl_joint_narrow_hurdle_lgb_test" with equivalent metadata already exists.
Fetching "fatalities003_nl_joint_narrow_hurdle_lgb_test" from storage


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to retrieve predictions 2023-11-20 13:52:51.684342
pr_56_cm_fatalities003_nl_joint_narrow_hurdle_lgb_test.parquet
**************************************************************
14 fatalities003_nl_all_pca3_xgb
Calibration partition 2023-11-20 13:52:56.136258
 * == Performing a run: "fatalities003_nl_all_pca3_xgb_calib" == * 
Model object named "fatalities003_nl_all_pca3_xgb_calib" with equivalent metadata already exists.
Fetching "fatalities003_nl_all_pca3_xgb_calib" from storage
  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model

In [19]:
# Exploring the predictions: last rows of the test-partition predictions for the
# unit with id 246 on index level 1 (presumably country_id — confirm).
# `predictions_test` is whatever the training loop assigned last, i.e. it belongs
# to the final model processed there.


predictions_test.xs(246,level=1).tail()

Unnamed: 0_level_0,gleditsch_ward,ged_sb_dep,ged_sb,ged_sb_tlag_1,ged_sb_tlag_2,ged_sb_tlag_3,ged_sb_tlag_4,ged_sb_tlag_5,ged_sb_tlag_6,ged_sb_tsum_24,...,step_pred_33,step_pred_34,step_pred_35,step_pred_36,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9
month_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
488,626.0,24.0,24.0,9.0,3.0,13.0,1.0,0.0,0.0,236.0,...,1737.577271,48.681305,11177.571289,7799.896973,374.683197,168.160339,219.422073,84.668091,3.271641,99.145401
489,626.0,1.0,1.0,24.0,9.0,3.0,13.0,1.0,0.0,212.0,...,703.365601,46.209198,11178.731445,7789.881836,374.683197,177.597229,219.422073,104.09507,3.271641,55.153473
490,626.0,0.0,0.0,1.0,24.0,9.0,3.0,13.0,1.0,207.0,...,1745.23999,44.605743,11178.110352,7901.313965,368.315887,184.015594,221.777145,84.668091,453.654602,23.664701
491,626.0,11.0,11.0,0.0,1.0,24.0,9.0,3.0,13.0,218.0,...,759.272461,192.343338,11175.55957,7839.179199,370.41626,207.32724,205.750641,117.136162,453.197845,25.098753
492,626.0,0.0,0.0,11.0,0.0,1.0,24.0,9.0,3.0,179.0,...,1785.327271,246.457077,11341.741211,7804.45459,371.653259,194.72081,237.207809,113.956322,455.407562,23.664701


## Notes on training time for the various algorithms:

In [None]:
#These are calculated in minutes for the hh20 feature set (with about 40 features), for all 36 steps, calibration (c) and test (t) partitions, also include generating predictions, and are approximate:

#nj=12 (number of threads)
#scikit random forest:        21:13 (c), 26:20 (t) RandomForestRegressor(n_estimators=200, n_jobs=nj)
#XGB random forest:           06:02 (c), 07:51 (t) XGBRFRegressor(n_estimators=300,n_jobs=nj)
#scikit gbm:                  13:59 (c), 15:55 (t) GradientBoostingRegressor(), 
#scikit hurdle random forest: 07:32 (c), 09:49 (t) For both clf and reg: (n_estimators=200, n_jobs=nj)
#XGB hurdle xgb:              01:26 (c), 01:32 (t) For both clf and reg:                n_estimators=200,tree_method='hist',n_jobs=nj)
#scikit histgbm:              01:17 (c), 01:20 (t) HistGradientBoostingRegressor(max_iter=200)
#XGB xgb:                     01:00 (c), 01:04 (t) XGBRegressor(n_estimators=200,tree_method='hist',n_jobs=nj)
#lightgbm gbm:                00:25 (c), --    (t) LGBMRegressor(n_estimators=100,num_threads=8)

# Various helper functions and tools....

In [20]:
!conda list | grep views-forecasts

views-forecasts           0.5.4                    pypi_0    pypi


# Retrieving external forecasts

In [21]:
# Specifications for David's externally produced Markov models.
# To do: rewrite the model dictionary to the new, slimmer version.
#
# Both entries share every setting except the model name, so they are generated
# from the list of names. Each dictionary gets its own empty 'algorithm' list.
DRList = []
for _markov_name in ('fat_hh20_Markov_glm', 'fat_hh20_Markov_rf'):
    model = dict(
        modelname=_markov_name,
        algorithm=[],
        depvar="ln_ged_sb_dep",
        data_train='hh20',
        queryset='hh_20_features',
    )
    DRList.append(model)



In [22]:
import os  # local import: `os` is not imported explicitly in the visible notebook

# Folder that holds the externally produced Markov-model predictions. It is
# derived from the configured Dropbox root (`Mydropbox`, imported from settings)
# instead of a hard-coded /Users/<login>/... prefix. The empty last component
# makes os.path.join end the result with a separator, which the string
# concatenations below rely on.
path = os.path.join(
    Mydropbox, 'Projects', 'PredictingFatalities', 'Predictions', 'cm', 'preds', ''
)

# Prediction files for the GLM-based Markov model.
DRList[0]['predictions_file_calib'] = path + 'vmm_glm_hh20_0125_alt_calib.csv'
DRList[0]['predictions_file_test'] = path + 'vmm_glm_hh20_0125_alt_test.csv'
DRList[0]['predictions_file_future'] = path + 'vmm_glm_hh20_506.csv'

# Prediction files for the random-forest-based Markov model.
# NOTE(review): the future file here ends in 505 while the GLM one ends in 506 —
# confirm that the two are meant to differ.
DRList[1]['predictions_file_calib'] = path + 'vmm_rf_hh20_0125_alt_calib.csv'
DRList[1]['predictions_file_test'] = path + 'vmm_rf_hh20_0125_alt_test.csv'
DRList[1]['predictions_file_future'] = path + 'vmm_rf_hh20_505.csv'

In [23]:
# Show the resolved predictions folder so the location can be checked by eye.
print(path)

/Users/root/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/


In [24]:

# Print the name of every model in ModelList (note: rebinds the global `model`).
for model in ModelList:
    print(model['modelname'])

fatalities003_nl_baseline_rf
fatalities003_nl_conflicthistory_rf
fatalities003_nl_conflicthistory_hurdle_lgb
fatalities003_nl_conflicthistory_long_xgb
fatalities003_nl_vdem_hurdle_xgb
fatalities003_nl_wdi_rf
fatalities003_nl_topics_rf
fatalities003_nl_topics_xgb
fatalities003_nl_topics_hurdle_lgb
fatalities003_nl_joint_broad_rf
fatalities003_nl_joint_broad_hurdle_rf
fatalities003_joint_narrow_xgb
fatalities003_nl_joint_narrow_hurdle_xgb
fatalities003_nl_joint_narrow_hurdle_lgb
fatalities003_nl_all_pca3_xgb


In [25]:
# Storing Markov models in central storage
# Retrieving dependent variable
# NOTE(review): the recorded run of this cell failed with
# KeyError: 'pr_56_cm_fat_conflicthistory_rf_calib.parquet does not exist'.
# The store names read below do not follow the 'fatalities003_nl_*' model names
# used in ModelList; the names (and possibly the target column) need updating
# before this cell can run. `run_id` is also not assigned in the visible notebook.
target_calib = pd.DataFrame.forecasts.read_store('cm_fat_conflicthistory_rf_calib', run=run_id)['ln_ged_sb_dep']
target_test = pd.DataFrame.forecasts.read_store('cm_fat_conflicthistory_rf_test', run=run_id)['ln_ged_sb_dep']
level = 'cm'
for model in DRList:
    # Load the externally produced predictions, indexed by (month_id, country_id).
    df_calib = pd.read_csv(model['predictions_file_calib'],index_col=['month_id','country_id'])
    df_test = pd.read_csv(model['predictions_file_test'],index_col=['month_id','country_id'])
    df_future = pd.read_csv(model['predictions_file_future'],index_col=['month_id','country_id'])
    # Attach the observed outcome; the assignment aligns on the index.
    df_calib['ln_ged_sb_dep'] = target_calib
    df_test['ln_ged_sb_dep'] = target_test
    df_future['ln_ged_sb_dep'] = np.nan # Empty dependent variable column for consistency/required by prediction storage function
    # NOTE(review): df_future is prepared above but never written to storage in
    # this cell — confirm whether a to_store call for it is missing.
    # Write calibration and test predictions to storage, replacing existing entries.
    stored_modelname = level + '_' + model['modelname'] + '_calib'
    df_calib.forecasts.set_run(run_id)
    df_calib.forecasts.to_store(name=stored_modelname, overwrite=True)
    stored_modelname = level + '_' + model['modelname'] + '_test'
    df_test.forecasts.set_run(run_id)
    df_test.forecasts.to_store(name=stored_modelname, overwrite=True)    

pr_56_cm_fat_conflicthistory_rf_calib.parquet


KeyError: 'pr_56_cm_fat_conflicthistory_rf_calib.parquet does not exist'

In [None]:
!viewser tables show ged2_pgm


In [None]:
# Display the DataFrame of the second dataset in `Datasets`.
# NOTE(review): `Datasets` (capitalised) is not assigned in the visible notebook;
# the data cell above binds `datasets` — confirm which name is intended.
Datasets[1]['df']