
# ViEWS 3 constituent models 
## ViEWS production system, cm level


This notebook trains a set of regression models for use in the monthly-updated ViEWS fatalities-prediction ensemble.

The notebook does the following: 
1. Retrieves data through querysets and stores it in `Datasets`, a list of dictionaries
2. Specifies the metadata of a number of models, stores in ModelList, a list of dictionaries
3. Trains the models in ModelList, stores the trained objects in model storage and prediction storage
4. Saves part of ModelList as csv and the rest as pickles


## Importing modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Standard library
import os
import sys
from typing import Optional, Union

# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
# sklearn
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier

from lightgbm import LGBMClassifier, LGBMRegressor

# Views 3
from viewser.operations import fetch
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
import views_dataviz
from views_runs import storage
from views_runs.storage import store, retrieve, fetch_metadata

from views_forecasts.extensions import *

# Other packages
import pickle as pkl

# Packages from viewsforecasting repository

#from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated
sys.path.append('../')
sys.path.append('../Tools')
sys.path.append('../Intermediates')
from FetchData import FetchData, RetrieveFromList, document_queryset, ReturnQsList, document_ensemble
from ViewsEstimators import *


## Common parameters

In [3]:
!conda list | grep views

# packages in environment at /Users/havardhegre1/mambaforge/envs/viewser:
views-dataviz             0.9.0                    pypi_0    pypi
views-forecasts           0.5.3                    pypi_0    pypi
views-mapper2             1.9.0                    pypi_0    pypi
views-partitioning        3.0.1                    pypi_0    pypi
views-runs                1.13.1                   pypi_0    pypi
views-schema              2.3.0                    pypi_0    pypi
views-storage             1.1.4                    pypi_0    pypi
views-transformation-library 2.4.1                    pypi_0    pypi
viewser                   5.13.0                   pypi_0    pypi


In [4]:
# To do:
# find out why and where missingness occurs

In [5]:
# Common parameters shared by the cells below.
dev_id = 'Fatalities003'
run_id = dev_id  # the run identifier is the same as the development identifier

# Generating a new run if necessary (kept for reference, disabled):

#try:
#    ViewsMetadata().new_run(name=run_id,description='Developing the fatalities model for FCDO',min_month=1,max_month=999)
#except KeyError:
#    if 'devel' not in run_id:
#        warnings.warn('You are overwriting a production system')

RerunQuerysets = True

FutureStart = 508
steps = list(range(1, 37))  # Which steps to train and predict for (1 through 36)
fi_steps = [1, 3, 6, 12, 36]  # Which steps to present feature importances for
# Reduced step set (disabled):
#steps = [1,3,6,12,36]
#fi_steps = [1,3,6,12,36]

# Specifying partitions: each maps a timespan name to a (first, last) pair
calib_partitioner_dict = dict(train=(121, 396), predict=(397, 444))
test_partitioner_dict = dict(train=(121, 444), predict=(445, 492))
future_partitioner_dict = dict(train=(121, 492), predict=(493, 504))
calib_partitioner = views_runs.DataPartitioner(dict(calib=calib_partitioner_dict))
test_partitioner = views_runs.DataPartitioner(dict(test=test_partitioner_dict))
future_partitioner = views_runs.DataPartitioner(dict(future=future_partitioner_dict))

# Local paths are derived from the login name of the current user
username = os.getlogin()
Mydropbox = f'/Users/{username}/Dropbox (ViEWS)/ViEWS'
overleafpath = f'/Users/{username}/Dropbox (ViEWS)/Apps/Overleaf/VIEWS documentation {dev_id}/'

print('Setting Mydropbox to', Mydropbox)
print('User:', username)
print('Overleaf path set to', overleafpath)

Setting Mydropbox to /Users/havardhegre1/Dropbox (ViEWS)/ViEWS
User: havardhegre1
Overleaf path set to /Users/havardhegre1/Dropbox (ViEWS)/Apps/Overleaf/VIEWS documentation Fatalities003/


# Retrieve data

In [6]:
# Create Markdown documentation of all querysets used
level = 'cm'  # level of analysis handed to ReturnQsList (this notebook works at the cm level)
qslist = ReturnQsList(level)  # queryset definitions for this level; reused when fetching the data below
document_queryset(qslist,dev_id)  # documents every queryset in qslist, labelled with dev_id

 .    fatalities003_baseline; A dataset with 6 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_baseline_nonlog; A dataset with 6 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_topics_stub; A dataset with 62 columns, with data between t 1 and 852. (213 units)
fatalities003_aquastat_stub; A dataset with 62 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_cm_conflict_history_stub; A dataset with 24 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_cm_conflict_history_ext; A dataset with 33 columns, with data between t = 1 and 852. (213 units)
 .    fatalities003_vdem_short_stub; A dataset with 57 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_wdi_short_stub; A dataset with 26 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_joint_narrow; A dataset with 39 columns, with data between t 1 and 852. (213 units)
 .    fatalities003_joint_broad_stub; A dat

In [7]:
class FixedFirstSplitRegression(BaseEstimator):
    """ Regression model which makes the first split according to a specified binary feature and then
    lets a separate sub-model handle each side of that split:
            1) the "ones" model is fitted on the rows where the indicator equals 1
            2) the "zeros" model is fitted on the rows where the indicator equals 0
    At prediction time every row is routed to the sub-model matching its indicator value
    (rows whose indicator is not 1 are sent to the "zeros" model).
    Follows the sklearn estimator conventions (constructor arguments stored unmodified,
    fitted attributes carry a trailing underscore).

    Args:
        ones_name: key (see _resolve_estimator) of the model to estimate where the indicator is one
        zeros_name: key (see _resolve_estimator) of the model to estimate where the indicator is zero
        ones_indicator: where to find the indicator in X. A column label (str) when X is a
            DataFrame, or an integer column position for DataFrames and arrays alike.
            When left empty the indicator must be supplied through the `z` argument.
        ones_params: dict of parameters to pass to "ones" sub-model when initialized
        zeros_params: dict of parameters to pass to "zeros" sub-model when initialized

    Attributes set by fit:
        ones_, zeros_: the fitted sub-models
        ones_fi, zeros_fi: feature importances of the sub-models, or [] when a sub-model
            does not expose `feature_importances_`
    """

    def __init__(self,
                 ones_name: str = 'RFRegressor',
                 zeros_name: str = 'RFRegressor',
                 ones_indicator: Union[str, int] = '',
                 ones_params: Optional[dict] = None,
                 zeros_params: Optional[dict] = None):

        self.ones_name = ones_name
        self.zeros_name = zeros_name
        self.ones_indicator = ones_indicator
        self.ones_params = ones_params
        self.zeros_params = zeros_params
        self.ones_fi = []
        self.zeros_fi = []

    @staticmethod
    def _resolve_estimator(func_name: str):
        """ Lookup table for supported estimators.
        This is necessary because sklearn estimator default arguments
        must pass equality test, and instantiated sub-estimators are not equal.

        Returns a fresh, unfitted estimator. Only the requested estimator is
        instantiated. Raises KeyError for an unknown name. """

        factories = {
            'linear': lambda: LinearRegression(),
            'logistic': lambda: LogisticRegression(solver='liblinear'),
            'LGBMRegressor': lambda: LGBMRegressor(n_estimators=250),
            'LGBMClassifier': lambda: LGBMClassifier(n_estimators=250),
            'RFRegressor': lambda: XGBRFRegressor(n_estimators=250,n_jobs=-2),
            'RFClassifier': lambda: XGBRFClassifier(n_estimators=250,n_jobs=-2),
            'GBMRegressor': lambda: GradientBoostingRegressor(n_estimators=200),
            'GBMClassifier': lambda: GradientBoostingClassifier(n_estimators=200),
            'XGBRegressor': lambda: XGBRegressor(n_estimators=100,learning_rate=0.05,n_jobs=-2),
            'XGBClassifier': lambda: XGBClassifier(n_estimators=100,learning_rate=0.05,n_jobs=-2),
            'HGBRegressor': lambda: HistGradientBoostingRegressor(max_iter=200),
            'HGBClassifier': lambda: HistGradientBoostingClassifier(max_iter=200),
        }

        if func_name not in factories:
            raise KeyError(f'Unknown estimator name {func_name!r}; supported names: {sorted(factories)}')
        return factories[func_name]()

    def _indicator_values(self, X, z=None) -> np.ndarray:
        """ Return the split indicator as a 1-d array with one entry per row of X.

        The indicator named by `ones_indicator` takes precedence; `z` is used when
        `ones_indicator` is empty, or when it is a column label but X has no labels.
        Must be called on the X passed in by the caller, before it is converted to an array. """
        indicator = self.ones_indicator
        if indicator is None or (isinstance(indicator, str) and indicator == ''):
            values = z
        elif isinstance(indicator, str):
            if isinstance(X, pd.DataFrame):
                if indicator not in X.columns:
                    raise ValueError(f'ones_indicator {indicator!r} is not a column of X')
                values = X[indicator]
            else:
                # Column labels are not available on plain arrays; fall back on z.
                values = z
        elif isinstance(X, pd.DataFrame):
            values = X.iloc[:, indicator]
        else:
            values = np.asarray(X)[:, indicator]

        if values is None:
            raise ValueError('No split indicator available: set ones_indicator to a column of X '
                             '(label for DataFrames, integer position for arrays) or pass z')
        values = np.asarray(values).ravel()
        if values.shape[0] != len(X):
            raise ValueError(f'Split indicator has {values.shape[0]} entries but X has {len(X)} rows')
        return values

    def _fit_submodel(self, name, params, X, y):
        """ Instantiate the estimator registered under `name`, apply `params` and fit it. """
        estimator = self._resolve_estimator(name)
        if params:
            estimator.set_params(**params)
        estimator.fit(X, y)
        return estimator

    def fit(self,
            X: Union[np.ndarray, pd.DataFrame],
            y: Union[np.ndarray, pd.Series],
            z: Optional[Union[np.ndarray, pd.Series]] = None):
        """ Fit the "ones" model on rows with indicator == 1 and the "zeros" model on rows with indicator == 0.

        Args:
            X: features, including the indicator column when `ones_indicator` is set
            y: outcome
            z: explicit indicator, only consulted when `ones_indicator` cannot be resolved from X
        Raises:
            ValueError: if X has fewer than two features, if no indicator is available,
                or if either side of the split has no training rows.
        """
        # Extract the indicator first: check_X_y returns plain arrays and drops column labels.
        indicator = self._indicator_values(X, z)
        X, y = check_X_y(X, y, dtype=None,
                         accept_sparse=False,
                         accept_large_sparse=False,
                         force_all_finite='allow-nan')

        if X.shape[1] < 2:
            raise ValueError('Cannot fit model when n_features = 1')

        ones_rows = indicator == 1
        zeros_rows = indicator == 0
        if not ones_rows.any() or not zeros_rows.any():
            raise ValueError('The split indicator must take both the value 0 and the value 1 in the training data')

        self.ones_ = self._fit_submodel(self.ones_name, self.ones_params, X[ones_rows], y[ones_rows])
        # Linear/logistic sub-models expose no feature importances.
        self.ones_fi = getattr(self.ones_, 'feature_importances_', [])

        self.zeros_ = self._fit_submodel(self.zeros_name, self.zeros_params, X[zeros_rows], y[zeros_rows])
        self.zeros_fi = getattr(self.zeros_, 'feature_importances_', [])

        self.is_fitted_ = True
        return self


    def predict(self,
                X: Union[np.ndarray, pd.DataFrame],
                z: Optional[Union[np.ndarray, pd.Series]] = None):
        """ Predict each row with the sub-model selected by its indicator value.

        Rows with indicator == 1 are predicted by the "ones" model, all other rows by the
        "zeros" model. Returns a 1-d float array with one prediction per row of X. """
        check_is_fitted(self, 'is_fitted_')
        ones_rows = self._indicator_values(X, z) == 1
        # Same validation settings as in fit, so data accepted for training is accepted here.
        X = check_array(X, dtype=None,
                        accept_sparse=False,
                        accept_large_sparse=False,
                        force_all_finite='allow-nan')
        predictions = np.empty(X.shape[0], dtype=float)
        if ones_rows.any():
            predictions[ones_rows] = self.ones_.predict(X[ones_rows])
        if not ones_rows.all():
            predictions[~ones_rows] = self.zeros_.predict(X[~ones_rows])
        return predictions

def manual_test():
    """ Ensure the estimator can fit and predict on a fake dataset.

    sklearn's generic check_estimator is not run here: it calls fit(X, y) on data without
    a binary indicator column, which this estimator requires. """
    from sklearn.datasets import make_regression
    X, y = make_regression(n_samples=200, n_features=5, random_state=0)
    # Turn the first feature into a binary indicator so that both branches receive rows.
    X[:, 0] = (X[:, 0] > 0).astype(float)
    reg = FixedFirstSplitRegression(ones_name='linear', zeros_name='linear', ones_indicator=0)
    reg.fit(X, y)
    predictions = reg.predict(X)
    assert predictions.shape == (X.shape[0],)
    


In [8]:
# Smoke test: construct the estimator (nothing is fitted here) and display it.
FixedFirstSplitRegression(ones_name='LGBMClassifier', zeros_name='LGBMRegressor', ones_indicator = '')

In [9]:
# NOTE(review): this import lives outside the imports cell at the top of the notebook.
from FetchData import fetch_cm_data_from_model_def

# Fetch the data for the querysets in qslist. The cells below read each entry of
# Datasets as a dictionary with (at least) the keys 'Name' and 'df'.
Datasets=fetch_cm_data_from_model_def(qslist)

 .    conflictlong_ln: A dataset with 63 columns, with data between t = 1 and 852; 213 units.
 .    vdem_short: A dataset with 63 columns, with data between t = 1 and 852; 213 units.
 .    joint_broad: A dataset with 108 columns, with data between t = 1 and 852; 213 units.
 .    topics_003: A dataset with 68 columns, with data between t = 1 and 852; 213 units.
 .    wdi_short: A dataset with 32 columns, with data between t = 1 and 852; 213 units.
 .    all_features: A dataset with 186 columns, with data between t = 1 and 852; 213 units.
 .    baseline003: A dataset with 6 columns, with data between t = 1 and 852; 213 units.
 .    joint_narrow: A dataset with 39 columns, with data between t = 1 and 852; 213 units.
 .    conflict_ln: A dataset with 30 columns, with data between t = 1 and 852; 213 units.
all_features [9.99840484e-01 1.59318589e-04 1.96990535e-07 2.59914144e-13
 5.37470633e-16 5.77311028e-17 4.60799782e-17 1.25960180e-17
 9.98533092e-18 3.88488306e-18 5.68618444e-19 1.0384

In [10]:
# Inspect the second queryset definition in qslist
qslist[1]

Queryset(name='fatalities003_topics', loa='country_month', themes=['fatalities003'], description='Predicting ln(fatalities), cm level\n    \n                           Queryset with baseline and Mueller & Rauh topic model features\n    \n                           ', operations=[[RenameOperation(namespace='trf', name='util.rename', arguments=['ged_sb_dep']), TransformOperation(namespace='trf', name='missing.fill', arguments=[]), DatabaseOperation(namespace='base', name='ged2_cm.ged_sb_best_sum_nokgi', arguments=['values'])], [RenameOperation(namespace='trf', name='util.rename', arguments=['ged_sb']), TransformOperation(namespace='trf', name='missing.fill', arguments=[]), DatabaseOperation(namespace='base', name='ged2_cm.ged_sb_best_sum_nokgi', arguments=['values'])], [RenameOperation(namespace='trf', name='util.rename', arguments=['decay_ged_sb_5']), TransformOperation(namespace='trf', name='missing.replace_na', arguments=[]), TransformOperation(namespace='trf', name='temporal.decay', 

# Generating predictions
Using the ViEWS3 partitioning/stepshifting syntax. Training models for A: calibration partition and B: test partition, to test out some calibration routines. Most models trained with ln_ged_sb_best as outcome.

In [11]:
# Display the development/run identifier currently in use
dev_id

'Fatalities003'

In [12]:
# Preview the first rows of the first dataset in Datasets
Datasets[0]['df'].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,gleditsch_ward,ged_sb_dep,ged_sb,ged_sb_tlag_1,ged_sb_tlag_2,ged_sb_tlag_3,ged_sb_tlag_4,ged_sb_tlag_5,ged_sb_tlag_6,ged_sb_tsum_24,...,ln_acled_os_tlag_1,ln_acled_os_tlag_2,ln_acled_ns_tlag_1,ln_acled_ns_tlag_2,splag_1_decay_ged_sb_5,splag_1_decay_ged_os_5,splag_1_decay_ged_ns_5,splag_1_decay_ged_sb_100,splag_1_decay_ged_os_100,splag_1_decay_ged_ns_100
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,110.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,101.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,990.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Reference only: print the signature and documentation of ModelMetadata.
# NOTE(review): ModelMetadata is not used by any other cell visible in this notebook.
from views_runs import ModelMetadata 
help(ModelMetadata)

Help on class ModelMetadata in module views_schema.models:

class ModelMetadata(pydantic.main.BaseModel)
 |  ModelMetadata(*, author: str, queryset_name: str, train_start: int, train_end: int, steps: List[int] = None, training_date: datetime.datetime) -> None
 |  
 |  ModelMetadata
 |  
 |  Data used to organize model objects.
 |  
 |  parameters:
 |      author (str): Name of the user that authored the model object.
 |      queryset_name (str): Name of the queryset used to train the model
 |      train_start (int): Month identifier for training start date
 |      train_start (int): Month identifier for training end date
 |      training_date (datetime.datetime): Timestamp for training date (use datetime.datetime.now())
 |  
 |  example:
 |  
 |      # Instantiate the class with values
 |  
 |      my_metadata = ModelMetadata(
 |          author = "my_name",
 |          queryset_name = "my_queryset",
 |          train_start = 1,
 |          train_end = 300,
 |          steps = [1,2,3],

## Checking missingness and infinity values

In [14]:
# Report, per dataset, the columns that contain missing or infinite values.
# Only the first N columns of each dataset are inspected.
# NOTE(review): some datasets have more than N columns; raise N to check all of them.
N=51
for dataset in Datasets:
    df = dataset['df']
    print(dataset['Name'])
    for col in df.columns[:N]:
        n_missing = df[col].isnull().sum()
        # Count infinities in this column only (np.isinf is undefined for non-numeric data).
        if pd.api.types.is_numeric_dtype(df[col]):
            n_infinite = np.isinf(df[col]).sum()
        else:
            n_infinite = 0
        if n_missing > 0 or n_infinite > 0:
            print(col,len(df[col]), 'missing:', n_missing, 'infinity:', n_infinite)


conflictlong_ln
vdem_short
joint_broad
topics_003
wdi_short
all_features
baseline003
joint_narrow
conflict_ln
pca_all
pca_topics
pca_vdem
pca_wdi


# Specify models in ensemble

In [15]:
# NOTE(review): this import lives outside the imports cell at the top of the notebook.
from ModelDefinitions import DefineEnsembleModels

# ModelList is a list of dictionaries, one per constituent model; the cells below
# read the keys 'modelname', 'algorithm', 'depvar', 'data_train' and 'queryset'.
ModelList = DefineEnsembleModels('cm')
    
# Overview: position in the list, model name, and the dataset each model is trained on
for imodel,model in enumerate(ModelList):
    print(imodel, model['modelname'], model['data_train'])

0 fatalities003_nl_baseline_rf baseline003
1 fatalities003_nl_conflicthistory_rf conflict_ln
2 fatalities003_nl_conflicthistory_hurdle_lgb conflict_ln
3 fatalities003_nl_conflicthistory_long_xgb conflictlong_ln
4 fatalities003_nl_vdem_hurdle_xgb vdem_short
5 fatalities003_nl_wdi_rf wdi_short
6 fatalities003_nl_topics_rf topics_003
7 fatalities003_nl_topics_xgb topics_003
8 fatalities003_nl_topics_hurdle_lgb topics_003
9 fatalities003_nl_joint_broad_rf joint_broad
10 fatalities003_nl_joint_broad_hurdle_rf joint_broad
11 fatalities003_joint_narrow_xgb joint_narrow
12 fatalities003_nl_joint_narrow_hurdle_xgb joint_narrow
13 fatalities003_nl_joint_narrow_hurdle_lgb joint_narrow
14 fatalities003_nl_all_pca3_xgb all_features


In [16]:
# Display the full model specifications (long output)
ModelList

[{'modelname': 'fatalities003_nl_baseline_rf',
  'algorithm': XGBRFRegressor(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bytree=None,
                 early_stopping_rounds=None, enable_categorical=False,
                 eval_metric=None, gamma=None, gpu_id=None, grow_policy=None,
                 importance_type=None, interaction_constraints=None, max_bin=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, n_estimators=300, n_jobs=12,
                 num_parallel_tree=None, objective='reg:squarederror',
                 predictor=None, random_state=None, reg_alpha=None,
                 sampling_method=None, scale_pos_weight=None, ...),
  'depvar': 'ged_sb_dep',
  'data_train': 'baseline003',
  'queryset': 'fatalities003_baseline',
  'preprocessing': 'float_it',
  'level': 'cm',
  'desc

In [17]:
# Build a table documenting the ensemble and, on the maintainer's machine, write it to Overleaf.
outcome = 'sb'  # outcome label; also part of the output file name
EnsembleMetaData_df = document_ensemble(ModelList,outcome)
# The file is only written for one hard-coded user name, so other users skip the export.
if username == 'havardhegre1':
    filename = overleafpath + f'Tables/Evaluation/Ensemble_{outcome}.md'
    EnsembleMetaData_df.to_markdown(index=False, buf=filename)

0 fatalities003_nl_baseline_rf baseline003


In [18]:
# Loop that checks whether the model exists, retrains if not,
# and stores the predictions if they have not been stored before for this run.
# To do: set the data_preprocessing to the function in the model dictionary

level = 'cm'
includeFuture = False

from datetime import datetime

from views_runs import Storage, StepshiftedModels
from views_partitioning.data_partitioner import DataPartitioner
from viewser import Queryset, Column
from views_runs import operations
from views_runs.run_result import RunResult

# Passed as `retrain` to RunResult.retrain_or_retrieve for every partition.
force_retrain = False


def _retrain_or_retrieve(model, partition_name, partitioner_dict, storage_suffix, retrain=False):
    """Train, or fetch from model storage, the step-shifted models of `model` for one partition.

    Args:
        model: model dictionary from ModelList
        partition_name: name of the partition; also the key of the partitioner
        partitioner_dict: timespans of the partition, with "train" and "predict" entries
        storage_suffix: appended to model['modelname'] to form the storage name
        retrain: forwarded to RunResult.retrain_or_retrieve
    Returns:
        The RunResult for this model and partition.
    """
    return RunResult.retrain_or_retrieve(
        retrain            = retrain,
        store              = storage.Storage(),
        partitioner        = DataPartitioner({partition_name: partitioner_dict}),
        stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
        dataset            = RetrieveFromList(Datasets, model['data_train']),
        queryset_name      = model['queryset'],
        partition_name     = partition_name,
        timespan_name      = "train",
        storage_name       = model['modelname'] + storage_suffix,
        author_name        = "HH",
    )


def _read_or_store_predictions(predstore_name, make_predictions):
    """Return the predictions stored as `predstore_name` for run_id, creating them if absent.

    A KeyError from the prediction store is taken to mean that the predictions do not
    exist; `make_predictions()` is then called and its result is tagged with run_id and stored.
    """
    try:
        return pd.DataFrame.forecasts.read_store(run=run_id, name=predstore_name)
    except KeyError:
        print(predstore_name, ', run', run_id, 'does not exist, predicting')
        predictions = make_predictions()
        predictions.forecasts.set_run(run_id)
        predictions.forecasts.to_store(name=predstore_name)
        return predictions


for i, model in enumerate(ModelList):
    # Models estimated by an external R script are skipped here.
    if model['algorithm'] != 'Rscript':
        print(i, model['modelname'])
        print('Calibration partition', datetime.now())
        model['Algorithm_text'] = str(model['algorithm'])
        model['RunResult_calib'] = _retrain_or_retrieve(
            model, "calib", calib_partitioner_dict, '_calib', retrain=force_retrain)

        # model['predstore_calib'] and model['predstore_test'] are read from the model
        # dictionary; they are not set in this cell.
        print('Trying to retrieve predictions', datetime.now())
        predictions_calib = _read_or_store_predictions(
            model['predstore_calib'],
            lambda: model['RunResult_calib'].run.predict("calib", "predict", model['RunResult_calib'].data),
        )

        print('Test partition', datetime.now())
        model['RunResult_test'] = _retrain_or_retrieve(
            model, "test", test_partitioner_dict, '_test', retrain=force_retrain)
        print('Trying to retrieve predictions', datetime.now())
        predictions_test = _read_or_store_predictions(
            model['predstore_test'],
            lambda: model['RunResult_test'].run.predict("test", "predict", model['RunResult_test'].data),
        )

        # Predictions for true future
        if includeFuture:
            print('Future', datetime.now())
            # The future timespans are registered under the partition name "test",
            # distinguished from the test partition by the '_future' storage suffix.
            model['RunResult_future'] = _retrain_or_retrieve(
                model, "test", future_partitioner_dict, '_future', retrain=force_retrain)
            print('Trying to retrieve predictions', datetime.now())
            model['predstore_future'] = level +  '_' + model['modelname'] + '_f' + str(FutureStart)
            predictions_future = _read_or_store_predictions(
                model['predstore_future'],
                lambda: model['RunResult_future'].run.future_point_predict(FutureStart, model['RunResult_future'].data),
            )
        print('**************************************************************')

print('All done')

0 fatalities003_nl_baseline_rf
Calibration partition 2023-05-27 17:14:22.053192
 * == Performing a run: "fatalities003_nl_baseline_rf_calib" == * 
Training model(s)...
Storing "fatalities003_nl_baseline_rf_calib"
Trying to retrieve predictions 2023-05-27 17:15:37.589809
pr_56_cm_fatalities003_nl_baseline_rf_calib.parquet
cm_fatalities003_nl_baseline_rf_calib , run Fatalities003 does not exist, predicting
Test partition 2023-05-27 17:15:48.309450
 * == Performing a run: "fatalities003_nl_baseline_rf_test" == * 
Training model(s)...
Storing "fatalities003_nl_baseline_rf_test"
Trying to retrieve predictions 2023-05-27 17:17:09.206490
pr_56_cm_fatalities003_nl_baseline_rf_test.parquet
cm_fatalities003_nl_baseline_rf_test , run Fatalities003 does not exist, predicting
**************************************************************
1 fatalities003_nl_conflicthistory_rf
Calibration partition 2023-05-27 17:17:19.901099
 * == Performing a run: "fatalities003_nl_conflicthistory_rf_calib" == * 
Tr

Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_conflicthistory_rf_calib"
Trying to retrieve predictions 2023-05-27 17:18:56.605774
pr_56_cm_fatalities003_nl_conflicthistory_rf_calib.parquet
cm_fatalities003_nl_conflicthistory_rf_calib , run Fatalities003 does not exist, predicting
Test partition 2023-05-27 17:19:07.316329
 * == Performing a run: "fatalities003_nl_conflicthistory_rf_test" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_conflicthistory_rf_test"
Trying to retrieve predictions 2023-05-27 17:20:50.890141
pr_56_cm_fatalities003_nl_conflicthistory_rf_test.parquet
cm_fatalities003_nl_conflicthistory_rf_test , run Fatalities003 does not exist, predicting
**************************************************************
2 fatalities003_nl_conflicthistory_hurdle_lgb
Calibration partition 2023-05-27 17:21:01.402417
 * == Performing a run: "fatalities003_nl_conflicthistory_hurdle_lgb_calib" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_conflicthistory_hurdle_lgb_calib"
Trying to retrieve predictions 2023-05-27 17:22:14.744888
pr_56_cm_fatalities003_nl_conflicthistory_hurdle_lgb_calib.parquet
cm_fatalities003_nl_conflicthistory_hurdle_lgb_calib , run Fatalities003 does not exist, predicting
Test partition 2023-05-27 17:22:26.769729
 * == Performing a run: "fatalities003_nl_conflicthistory_hurdle_lgb_test" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_conflicthistory_hurdle_lgb_test"
Trying to retrieve predictions 2023-05-27 17:23:40.670506
pr_56_cm_fatalities003_nl_conflicthistory_hurdle_lgb_test.parquet
cm_fatalities003_nl_conflicthistory_hurdle_lgb_test , run Fatalities003 does not exist, predicting
**************************************************************
3 fatalities003_nl_conflicthistory_long_xgb
Calibration partition 2023-05-27 17:23:52.886369
 * == Performing a run: "fatalities003_nl_conflicthistory_long_xgb_calib" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_conflicthistory_long_xgb_calib"
Trying to retrieve predictions 2023-05-27 17:24:50.345480
pr_56_cm_fatalities003_nl_conflicthistory_long_xgb_calib.parquet
cm_fatalities003_nl_conflicthistory_long_xgb_calib , run Fatalities003 does not exist, predicting
Test partition 2023-05-27 17:25:00.791201
 * == Performing a run: "fatalities003_nl_conflicthistory_long_xgb_test" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_conflicthistory_long_xgb_test"
Trying to retrieve predictions 2023-05-27 17:26:08.151158
pr_56_cm_fatalities003_nl_conflicthistory_long_xgb_test.parquet
cm_fatalities003_nl_conflicthistory_long_xgb_test , run Fatalities003 does not exist, predicting
**************************************************************
4 fatalities003_nl_vdem_hurdle_xgb
Calibration partition 2023-05-27 17:26:19.074987
 * == Performing a run: "fatalities003_nl_vdem_hurdle_xgb_calib" == * 
Training model(s)...
Storing "fatalities003_nl_vdem_hurdle_xgb_calib"
Trying to retrieve predictions 2023-05-27 17:27:37.671246
pr_56_cm_fatalities003_nl_vdem_hurdle_xgb_calib.parquet
cm_fatalities003_nl_vdem_hurdle_xgb_calib , run Fatalities003 does not exist, predicting
Test partition 2023-05-27 17:27:48.192460
 * == Performing a run: "fatalities003_nl_vdem_hurdle_xgb_test" == * 
Training model(s)...
Storing "fatalities003_nl_vdem_hurdle_xgb_test"
Trying to retrieve predictions 2023-05-27 17:29:20.4

Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_joint_broad_rf_calib"
Trying to retrieve predictions 2023-05-27 17:47:17.408897
pr_56_cm_fatalities003_nl_joint_broad_rf_calib.parquet
cm_fatalities003_nl_joint_broad_rf_calib , run Fatalities003 does not exist, predicting
Test partition 2023-05-27 17:47:28.250544
 * == Performing a run: "fatalities003_nl_joint_broad_rf_test" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_joint_broad_rf_test"
Trying to retrieve predictions 2023-05-27 17:50:57.802263
pr_56_cm_fatalities003_nl_joint_broad_rf_test.parquet
cm_fatalities003_nl_joint_broad_rf_test , run Fatalities003 does not exist, predicting
**************************************************************
10 fatalities003_nl_joint_broad_hurdle_rf
Calibration partition 2023-05-27 17:51:08.746804
 * == Performing a run: "fatalities003_nl_joint_broad_hurdle_rf_calib" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_joint_broad_hurdle_rf_calib"
Trying to retrieve predictions 2023-05-27 17:54:58.474504
pr_56_cm_fatalities003_nl_joint_broad_hurdle_rf_calib.parquet
cm_fatalities003_nl_joint_broad_hurdle_rf_calib , run Fatalities003 does not exist, predicting
Test partition 2023-05-27 17:55:09.517463
 * == Performing a run: "fatalities003_nl_joint_broad_hurdle_rf_test" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities003_nl_joint_broad_hurdle_rf_test"
Trying to retrieve predictions 2023-05-27 17:59:57.078598
pr_56_cm_fatalities003_nl_joint_broad_hurdle_rf_test.parquet
cm_fatalities003_nl_joint_broad_hurdle_rf_test , run Fatalities003 does not exist, predicting
**************************************************************
11 fatalities003_joint_narrow_xgb
Calibration partition 2023-05-27 18:00:08.442041
 * == Performing a run: "fatalities003_joint_narrow_xgb_calib" == * 
Model object named "fatalities003_joint_narrow_xgb_calib" with equivalent metadata already exists.
Fetching "fatalities003_joint_narrow_xgb_calib" from storage
Trying to retrieve predictions 2023-05-27 18:00:10.161423
pr_56_cm_fatalities003_joint_narrow_xgb_calib.parquet
Test partition 2023-05-27 18:00:13.693742
 * == Performing a run: "fatalities003_joint_narrow_xgb_test" == * 
Model object named "fatalities003_joint_narrow_xgb_test" with equivalent metadata already exists.
Fetching "fatalities003_joint_narrow_x

ValueError: Feature ged_sb_dep is not a feature in array

In [None]:
# Exploring the predictions: last rows of predictions_test for the unit with
# value 246 on index level 1.
# NOTE(review): this reads the test-partition predictions, not the future ones
# (predictions_future only exists when includeFuture is True).


predictions_test.xs(246,level=1).tail()

## Notes on training time for the various algorithms:

In [None]:
#These are calculated in minutes for the hh20 feature set (with about 40 features), for all 36 steps, calibration (c) and test (t) partitions, also include generating predictions, and are approximate:

#nj=12 (number of threads)
#scikit random forest:        21:13 (c), 26:20 (t) RandomForestRegressor(n_estimators=200, n_jobs=nj)
#XGB random forest:           06:02 (c), 07:51 (t) XGBRFRegressor(n_estimators=300,n_jobs=nj)
#scikit gbm:                  13:59 (c), 15:55 (t) GradientBoostingRegressor(), 
#scikit hurdle random forest: 07:32 (c), 09:49 (t) For both clf and reg: (n_estimators=200, n_jobs=nj)
#XGB hurdle xgb:              01:26 (c), 01:32 (t) For both clf and reg:                n_estimators=200,tree_method='hist',n_jobs=nj)
#scikit histgbm:              01:17 (c), 01:20 (t) HistGradientBoostingRegressor(max_iter=200)
#XGB xgb:                     01:00 (c), 01:04 (t) XGBRegressor(n_estimators=200,tree_method='hist',n_jobs=nj)
#lightgbm gbm:                00:25 (c), --    (t) LGBMRegressor(n_estimators=100,num_threads=8)

# Various helper functions and tools....

In [None]:
!conda list | grep views-forecasts

# Retrieving external forecasts

In [None]:
# Retrieve David's Markov models
# To do: rewrite the model dictionary to the new, slimmer version.
# One dictionary per externally produced Markov model; the two entries differ only in name.
DRList = [
    {
        'modelname': markov_modelname,
        'algorithm': [],
        'depvar': "ln_ged_sb_dep",
        'data_train': 'hh20',
        'queryset': 'hh_20_features',
    }
    for markov_modelname in ('fat_hh20_Markov_glm', 'fat_hh20_Markov_rf')
]



In [None]:
# Folder holding the externally produced prediction files (csv), below the user's Dropbox.
path = f'/Users/{os.getlogin()}/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/'

# Files for the first entry of DRList (fat_hh20_Markov_glm)
DRList[0]['predictions_file_calib'] = path + 'vmm_glm_hh20_0125_alt_calib.csv'
DRList[0]['predictions_file_test'] = path + 'vmm_glm_hh20_0125_alt_test.csv'
DRList[0]['predictions_file_future'] = path + 'vmm_glm_hh20_506.csv'

# Files for the second entry of DRList (fat_hh20_Markov_rf)
# NOTE(review): the future file names end in 506 (glm) and 505 (rf), and FutureStart is 508;
# confirm that these are the intended files.
DRList[1]['predictions_file_calib'] = path + 'vmm_rf_hh20_0125_alt_calib.csv'
DRList[1]['predictions_file_test'] = path + 'vmm_rf_hh20_0125_alt_test.csv'
DRList[1]['predictions_file_future'] = path + 'vmm_rf_hh20_505.csv'

In [None]:
# Show the folder the external prediction files are read from
print(path)

In [None]:

# List the names of the models in ModelList
for model in ModelList:
    print(model['modelname'])

In [None]:
# Storing Markov models in central storage
# Retrieving dependent variable
# The outcome column is taken from stored predictions of another model and attached to the
# external predictions below (assignment aligns on the month_id/country_id index).
# NOTE(review): these store names (cm_fat_conflicthistory_rf_*) do not follow the
# 'fatalities003' naming used by the models trained above; confirm they exist for run_id.
target_calib = pd.DataFrame.forecasts.read_store('cm_fat_conflicthistory_rf_calib', run=run_id)['ln_ged_sb_dep']
target_test = pd.DataFrame.forecasts.read_store('cm_fat_conflicthistory_rf_test', run=run_id)['ln_ged_sb_dep']
level = 'cm'
for model in DRList:
    # Read the external predictions for each partition
    df_calib = pd.read_csv(model['predictions_file_calib'],index_col=['month_id','country_id'])
    df_test = pd.read_csv(model['predictions_file_test'],index_col=['month_id','country_id'])
    df_future = pd.read_csv(model['predictions_file_future'],index_col=['month_id','country_id'])
    df_calib['ln_ged_sb_dep'] = target_calib
    df_test['ln_ged_sb_dep'] = target_test
    df_future['ln_ged_sb_dep'] = np.nan # Empty dependent variable column for consistency/required by prediction storage function
    # Store under <level>_<modelname>_<partition>, replacing any existing entry
    stored_modelname = level + '_' + model['modelname'] + '_calib'
    df_calib.forecasts.set_run(run_id)
    df_calib.forecasts.to_store(name=stored_modelname, overwrite=True)
    stored_modelname = level + '_' + model['modelname'] + '_test'
    df_test.forecasts.set_run(run_id)
    df_test.forecasts.to_store(name=stored_modelname, overwrite=True)    
    # NOTE(review): df_future is prepared above but never written to the store.

In [None]:
!viewser tables show ged2_pgm


In [None]:
# Display the DataFrame of the second dataset in Datasets
Datasets[1]['df']