# Notebook to define ensemble for production, cm level
Version developed for ViEWS monthly updates: Fatalities003
## Including ensemble weighting

This notebook defines the ensemble used for production: selects a set of pre-trained models, retrieves and calibrates them, computes weights, and computes and stores the ensemble model predictions.

The models are kept in model storage; most of them are specified in the notebook fat_cm_constituentmodels.

The notebook draws on the following files in this repository:

Script file: 
    Ensembling.py
    FetchData.py

Lists of models:
    ModelList_cm_{dev_id}.csv (not yet functional)
    List of pickles at local directory (will rewrite to drop dependence on this)

# Note
### The values stored for models 11, 12 and 13 are on the log scale, even though the outcome column for models 12 and 13 is named ged_sb_dep



In [25]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook

# Views 3
from viewser.operations import fetch
from viewser import Queryset, Column
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
# import views_dataviz
from views_runs import storage, ModelMetadata
from views_runs.storage import store, retrieve, fetch_metadata
from views_forecasts.extensions import *

#sklearn
from sklearn.metrics import mean_squared_error

# Other packages
import pickle as pkl

pd.set_option('display.float_format', lambda x: '%.6f' % x)

In [26]:
# Packages from this repository, Tools folder
import sys
sys.path.append('../')
sys.path.append('../Tools')
sys.path.append('../Intermediates')
import os

from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated
from FetchData import FetchData, RetrieveFromList
from ViewsEstimators import *


In [27]:
# Common parameters:
# Run-wide settings read by the cells further down in this notebook.

# Identifier passed as `run=` when reading stored forecasts and handed to
# RetrieveStoredPredictions.
dev_id = 'Fatalities003'
run_id = 'Fatalities003'
# Handed to RetrieveStoredPredictions together with `steps`.
# NOTE(review): the future partition below ends at month 504 while
# EndOfHistory is 509 -- confirm that this is intended.
EndOfHistory = 509
RunGeneticAlgo = True
# Level of analysis; used as argument to DefineEnsembleModels and as prefix
# of the stored prediction names.
level = 'cm'
# Handed to RetrieveStoredPredictions.
get_future = False

# Login name, interpolated into the local paths below.
username = os.getlogin()

steps = [*range(1, 36+1, 1)] # Which steps to train and predict for

fi_steps = [1,3,6,12,36]
# Specifying partitions
# Each entry is a (first, last) month_id pair for the training and the
# prediction period -- presumably inclusive bounds; TODO confirm.

calib_partitioner_dict = {"train":(121,396),"predict":(397,444)}
test_partitioner_dict = {"train":(121,444),"predict":(445,492)}
future_partitioner_dict = {"train":(121,492),"predict":(493,504)}
calib_partitioner =  views_runs.DataPartitioner({"calib":calib_partitioner_dict})
test_partitioner =  views_runs.DataPartitioner({"test":test_partitioner_dict})
future_partitioner =  views_runs.DataPartitioner({"future":future_partitioner_dict})

# NOTE(review): these are absolute paths under /Users/<username>/, so they
# only resolve on machines that use this directory layout.
Mydropbox = f'/Users/{username}/Dropbox (ViEWS)/ViEWS/'
localpath = f'/Users/{username}/Pickles/'
overleafpath = f'/Users/{username}/Dropbox (ViEWS)/Apps/Overleaf/VIEWS documentation {dev_id}/'

# Echo the resolved user and paths so that a wrong location is spotted early.
print('User:', username)
print('Dropbox path set to',Mydropbox)
print('Overleaf path set to',overleafpath)
print('Local path set to',localpath)

User: root
Dropbox path set to /Users/root/Dropbox (ViEWS)/ViEWS/
Overleaf path set to /Users/root/Dropbox (ViEWS)/Apps/Overleaf/VIEWS documentation Fatalities003/
Local path set to /Users/root/Pickles/


In [28]:
from ModelDefinitions import DefineEnsembleModels

# Build the list of constituent models for the chosen level of analysis.
ModelList = DefineEnsembleModels(level)

# Show each model's position in the list next to its name and training data set;
# later cells address individual models by this position.
for i, model in enumerate(ModelList):
    print(i, model['modelname'], model['data_train'])

0 fatalities003_nl_baseline_rf baseline003
1 fatalities003_nl_conflicthistory_rf conflict_ln
2 fatalities003_nl_conflicthistory_hurdle_lgb conflict_ln
3 fatalities003_nl_conflicthistory_long_xgb conflictlong_ln
4 fatalities003_nl_vdem_hurdle_xgb vdem_short
5 fatalities003_nl_wdi_rf wdi_short
6 fatalities003_nl_topics_rf topics_003
7 fatalities003_nl_topics_xgb topics_003
8 fatalities003_nl_topics_hurdle_lgb topics_003
9 fatalities003_nl_joint_broad_rf joint_broad
10 fatalities003_nl_joint_broad_hurdle_rf joint_broad
11 fatalities003_joint_narrow_xgb joint_narrow
12 fatalities003_nl_joint_narrow_hurdle_xgb joint_narrow
13 fatalities003_nl_joint_narrow_hurdle_lgb joint_narrow
14 fatalities003_nl_all_pca3_xgb all_features


In [29]:
# Columns to show: the outcome column followed by one prediction column per step.
stepcols = ['ln_ged_sb_dep'] + [f'step_pred_{step}' for step in steps]
# Stored name of the calibration-partition predictions of model 11.
stored_modelname_calib = f"{level}_{ModelList[11]['modelname']}_calib"
pd.DataFrame.forecasts.read_store(stored_modelname_calib, run=dev_id)[stepcols]

pr_56_cm_fatalities003_joint_narrow_xgb_calib.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,ln_ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.000000,0.008420,0.011582,0.012673,0.014179,0.014431,0.014855,0.016646,0.020506,0.017444,...,0.024846,0.026930,0.027107,0.117257,0.025644,0.025099,0.024437,0.022103,0.022669,0.023872
397,2,0.000000,0.008420,0.011476,0.012418,0.014019,0.013617,0.014294,0.016476,0.021644,0.018288,...,0.022748,0.021641,0.022437,0.021148,0.020752,0.019647,0.018917,0.018413,0.019921,0.020009
397,3,0.000000,0.008420,0.011476,0.012418,0.014019,0.013617,0.014294,0.015214,0.015707,0.018288,...,0.022748,0.021641,0.022437,0.021148,0.020752,0.019647,0.018917,0.018413,0.019921,0.020009
397,4,0.000000,0.008659,0.011582,0.012673,0.014421,0.014431,0.017202,0.045097,0.018661,0.022210,...,0.074820,0.036520,0.056549,0.050284,0.061734,0.053394,0.067794,0.065420,0.123116,0.128227
397,5,0.000000,0.008420,0.011476,0.012418,0.014019,0.013617,0.014294,0.016476,0.021644,0.018288,...,0.022748,0.021641,0.022437,0.021148,0.020752,0.019647,0.018917,0.018413,0.019921,0.020009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.000000,0.017873,0.019947,0.019739,0.025537,0.019502,0.023194,0.020523,0.021112,0.024462,...,0.133979,0.135839,0.137115,0.150589,0.164453,0.165343,0.166977,0.167235,0.175178,0.169526
444,243,0.000000,0.008511,0.011476,0.012418,0.014019,0.013617,0.014294,0.015214,0.015707,0.015991,...,0.023141,0.022411,0.023122,0.021496,0.021082,0.019647,0.019860,0.019287,0.020205,0.020908
444,244,0.000000,0.009407,0.012381,0.012647,0.014203,0.013976,0.014618,0.015567,0.017467,0.018470,...,0.061876,0.107264,0.122521,0.117023,0.057673,0.051980,0.062827,0.053596,0.062296,0.038805
444,245,0.000000,1.889669,2.477441,3.376249,2.985935,3.317934,2.572747,3.731551,4.061934,4.168920,...,1.934937,2.045012,1.900755,2.903455,3.046722,2.139882,2.063412,2.089220,2.078251,1.945472


# Retrieve and calibrate predictions

In [30]:
# Retrieving the predictions for calibration and test partitions
# The ModelList contains the predictions organized by model
# (the evaluation cells below read them from the keys 'predictions_calib_df'
# and 'predictions_test_df' of each model entry).

ModelList = RetrieveStoredPredictions(ModelList, steps, EndOfHistory, dev_id, level, get_future)

# NOTE: the calibration step is currently disabled, so the predictions used
# below are the uncalibrated ones as retrieved from storage.
# ModelList = CalibratePredictions(ModelList, EndOfHistory, steps)

0 fatalities003_nl_baseline_rf
pr_56_cm_fatalities003_nl_baseline_rf_calib.parquet
pr_56_cm_fatalities003_nl_baseline_rf_test.parquet
1 fatalities003_nl_conflicthistory_rf
pr_56_cm_fatalities003_nl_conflicthistory_rf_calib.parquet
pr_56_cm_fatalities003_nl_conflicthistory_rf_test.parquet
2 fatalities003_nl_conflicthistory_hurdle_lgb
pr_56_cm_fatalities003_nl_conflicthistory_hurdle_lgb_calib.parquet
pr_56_cm_fatalities003_nl_conflicthistory_hurdle_lgb_test.parquet
3 fatalities003_nl_conflicthistory_long_xgb
pr_56_cm_fatalities003_nl_conflicthistory_long_xgb_calib.parquet
pr_56_cm_fatalities003_nl_conflicthistory_long_xgb_test.parquet
4 fatalities003_nl_vdem_hurdle_xgb
pr_56_cm_fatalities003_nl_vdem_hurdle_xgb_calib.parquet
pr_56_cm_fatalities003_nl_vdem_hurdle_xgb_test.parquet
5 fatalities003_nl_wdi_rf
pr_56_cm_fatalities003_nl_wdi_rf_calib.parquet
pr_56_cm_fatalities003_nl_wdi_rf_test.parquet
6 fatalities003_nl_topics_rf
pr_56_cm_fatalities003_nl_topics_rf_calib.parquet
pr_56_cm_fatali

In [31]:
# Model 11 is stored on the log scale with the outcome column named
# 'ln_ged_sb_dep'. Convert every column back with exp(x) - 1 and rename the
# outcome column to 'ged_sb_dep'.
# The presence of 'ln_ged_sb_dep' doubles as the "not yet converted" marker, so
# re-running this cell does not exponentiate the values a second time.
# np.expm1 is the vectorised, numerically accurate form of exp(x) - 1 and
# replaces the deprecated element-wise DataFrame.applymap.
for partition_key in ('predictions_calib_df', 'predictions_test_df'):
    stored_predictions = ModelList[11][partition_key]
    if 'ln_ged_sb_dep' in stored_predictions.columns:
        ModelList[11][partition_key] = np.expm1(stored_predictions).rename(
            columns={'ln_ged_sb_dep': 'ged_sb_dep'}
        )
ModelList[11]['predictions_test_df']

Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
445,1,0.000000,0.009167,0.009891,0.012589,0.013190,0.013403,0.015648,0.015318,0.028433,0.018103,...,0.024727,0.034935,0.030096,0.031895,0.026588,0.028692,0.028179,0.025854,0.027549,0.028055
445,2,0.000000,0.009167,0.009697,0.012589,0.013058,0.013226,0.014898,0.015174,0.028433,0.018103,...,0.019271,0.019915,0.021934,0.020112,0.019567,0.019655,0.018728,0.017578,0.017987,0.018030
445,3,0.000000,0.009167,0.009697,0.011348,0.013058,0.013226,0.014898,0.015174,0.014908,0.014455,...,0.019271,0.019915,0.021934,0.020112,0.019567,0.019655,0.018728,0.017578,0.017987,0.018030
445,4,0.000000,0.012901,0.020516,0.017851,0.025427,0.028214,0.028537,0.023996,0.037756,0.020045,...,0.030460,0.030369,0.035826,0.052080,0.057105,0.080281,0.085169,0.101610,0.126006,0.139494
445,5,0.000000,0.009167,0.009697,0.012589,0.013058,0.013226,0.014898,0.015174,0.028433,0.018103,...,0.019271,0.019915,0.021934,0.020112,0.019567,0.019655,0.018728,0.017578,0.017987,0.018030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,242,0.000000,1.366919,1.430469,0.029422,0.032619,0.030597,0.042787,0.046214,0.057206,0.067597,...,0.157345,0.160875,0.168210,0.171885,0.185036,0.177733,0.186591,0.179903,0.183747,0.212211
492,243,0.000000,1.099701,0.009697,0.011619,0.013058,0.013226,0.015052,0.015174,0.014908,0.014812,...,0.019453,0.020766,0.021934,0.020112,0.019906,0.020171,0.019127,0.018131,0.017987,0.018030
492,244,0.000000,0.009613,0.011301,0.013539,0.014310,0.014538,0.015642,0.015674,0.018947,0.014960,...,0.039761,0.042759,0.042088,0.042418,0.042666,0.035804,0.032693,0.032309,0.049032,0.037846
492,245,0.000000,0.755424,2.479162,4.422010,7.200095,5.574021,2.070196,0.294606,0.310444,0.391680,...,12.175900,10.848603,36.012710,27.432978,24.526728,18.529865,8.097266,9.904621,14.033304,9.300242


In [32]:
# Models 12 and 13 are converted back from the log scale with exp(x) - 1.
# Their outcome column is already named 'ged_sb_dep', so no rename is needed.
# np.expm1 is the vectorised, numerically accurate form of exp(x) - 1 and
# replaces the deprecated element-wise DataFrame.applymap.
# NOTE: this cell is not idempotent -- every execution applies the
# transformation again, so run it exactly once after retrieving the predictions.
for model_idx in (12, 13):
    for partition_key in ('predictions_calib_df', 'predictions_test_df'):
        ModelList[model_idx][partition_key] = np.expm1(ModelList[model_idx][partition_key])


# Evaluate models

In [33]:
# Key of the prediction frame to evaluate; swap the commented line to switch partition.
# NOTE(review): the evaluation calls visible further down pass the key as a
# literal instead of reading this variable -- confirm whether it is still needed.
prediction_df = 'predictions_calib_df'
# prediction_df = 'predictions_test_df'

In [34]:
def calculate_mse(ModelList, prediction_df, pred_steps=None):
    """Add a per-row 'mse' column to every model's prediction frame.

    For each row, the squared differences between the observed 'ged_sb_dep'
    value and the 'step_pred_<s>' predictions are averaged over all steps.
    The frames are modified in place; nothing is returned.

    Parameters
    ----------
    ModelList : list of dict
        Model entries; each must hold a DataFrame under the key `prediction_df`.
    prediction_df : str
        Key of the frame to evaluate, e.g. 'predictions_calib_df'.
    pred_steps : iterable of int, optional
        Steps whose prediction columns enter the mean. Defaults to the
        notebook-level `steps` list.

    Raises
    ------
    ValueError
        If no steps are given.
    KeyError
        If 'ged_sb_dep' or one of the prediction columns is missing.

    Notes
    -----
    The number of steps is taken from the columns actually used instead of a
    hard-coded 36, and the computation is vectorised rather than evaluated
    row by row. A row that contains a missing value gets a missing 'mse'.
    """
    if pred_steps is None:
        pred_steps = steps  # notebook-level default
    pred_cols = [f'step_pred_{step}' for step in pred_steps]
    if not pred_cols:
        raise ValueError('calculate_mse needs at least one prediction step')

    for model in ModelList:
        df = model[prediction_df]
        observed = df['ged_sb_dep'].to_numpy(dtype=float)
        predicted = df[pred_cols].to_numpy(dtype=float)
        # Broadcast the observed value across all step columns of its row.
        df['mse'] = ((predicted - observed[:, None]) ** 2).mean(axis=1)

def get_model_mse(ModelList, prediction_df):
    """Return one row per model with the mean of its per-row 'mse' column.

    `prediction_df` is the key under which each model entry stores the frame
    to summarise. The result has the columns 'model' and 'mse' and follows
    the order of `ModelList`.
    """
    summary = [
        (entry['modelname'], entry[prediction_df]['mse'].mean())
        for entry in ModelList
    ]
    names = [name for name, _ in summary]
    mean_errors = [mean_error for _, mean_error in summary]
    return pd.DataFrame({'model': names, 'mse': mean_errors})

def get_top_10_cases(ModelList, prediction_df, n=10):
    """Collect, for every model, the rows with the largest per-row 'mse'.

    Parameters
    ----------
    ModelList : list of dict
        Model entries; each must hold a DataFrame under the key `prediction_df`
        with an 'mse' column and a two-level index (month first, country second).
    prediction_df : str
        Key of the frame to inspect, e.g. 'predictions_test_df'.
    n : int, default 10
        Maximum number of rows to keep per model.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'model', with the columns 'month_id', 'country_id' and
        'mse', sorted by descending 'mse' within each model.

    Notes
    -----
    The model name is repeated once per row actually selected, so a frame with
    fewer than `n` rows no longer yields columns of unequal length.
    """
    top_cases = {'model': [], 'month_id': [], 'country_id': [], 'mse': []}
    for model in ModelList:
        worst = model[prediction_df].sort_values(by=['mse'], ascending=False).head(n)
        top_cases['model'].extend([model['modelname']] * len(worst))
        top_cases['month_id'].extend(worst.index.get_level_values(level=0))
        top_cases['country_id'].extend(worst.index.get_level_values(level=1))
        top_cases['mse'].extend(worst['mse'])
    return pd.DataFrame(top_cases).set_index('model')


In [35]:
# Add the per-row 'mse' column to both the calibration and the test frames of
# every model; the summary cells below read that column.
calculate_mse(ModelList, 'predictions_calib_df')
calculate_mse(ModelList, 'predictions_test_df')

## calib_df

In [37]:
# Mean of the per-row MSE for each model on the calibration partition, in model-list order.
get_model_mse(ModelList, 'predictions_calib_df') 

Unnamed: 0,model,mse
0,fatalities003_nl_baseline_rf,226794.434086
1,fatalities003_nl_conflicthistory_rf,233468.644101
2,fatalities003_nl_conflicthistory_hurdle_lgb,208824.486308
3,fatalities003_nl_conflicthistory_long_xgb,527213.482069
4,fatalities003_nl_vdem_hurdle_xgb,710133.395333
5,fatalities003_nl_wdi_rf,218310.056328
6,fatalities003_nl_topics_rf,270199.737508
7,fatalities003_nl_topics_xgb,478322.044438
8,fatalities003_nl_topics_hurdle_lgb,229854.243255
9,fatalities003_nl_joint_broad_rf,301660.673825


In [38]:
# Same calibration-partition summary, sorted by ascending MSE (lowest first).
get_model_mse(ModelList, 'predictions_calib_df').sort_values(by=['mse'])

Unnamed: 0,model,mse
2,fatalities003_nl_conflicthistory_hurdle_lgb,208824.486308
5,fatalities003_nl_wdi_rf,218310.056328
13,fatalities003_nl_joint_narrow_hurdle_lgb,225942.52528
0,fatalities003_nl_baseline_rf,226794.434086
12,fatalities003_nl_joint_narrow_hurdle_xgb,228206.830828
11,fatalities003_joint_narrow_xgb,228245.11615
8,fatalities003_nl_topics_hurdle_lgb,229854.243255
1,fatalities003_nl_conflicthistory_rf,233468.644101
6,fatalities003_nl_topics_rf,270199.737508
9,fatalities003_nl_joint_broad_rf,301660.673825


In [39]:
# Rows with the largest per-row MSE for each model on the calibration partition.
get_top_10_cases(ModelList, 'predictions_calib_df')

Unnamed: 0_level_0,month_id,country_id,mse
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fatalities003_nl_baseline_rf,420,220,343347462.743495
fatalities003_nl_baseline_rf,408,220,339572761.048223
fatalities003_nl_baseline_rf,444,220,288867512.324259
fatalities003_nl_baseline_rf,432,220,276694196.499313
fatalities003_nl_baseline_rf,398,220,31257369.266348
...,...,...,...
fatalities003_nl_all_pca3_xgb,422,70,71738516.922842
fatalities003_nl_all_pca3_xgb,434,220,54954921.294722
fatalities003_nl_all_pca3_xgb,433,220,53884778.173239
fatalities003_nl_all_pca3_xgb,438,220,52900032.317508


## test_df

In [40]:
# Mean of the per-row MSE for each model on the test partition, in model-list order.
get_model_mse(ModelList, 'predictions_test_df') 

Unnamed: 0,model,mse
0,fatalities003_nl_baseline_rf,49273.511126
1,fatalities003_nl_conflicthistory_rf,160455.801483
2,fatalities003_nl_conflicthistory_hurdle_lgb,97776.831939
3,fatalities003_nl_conflicthistory_long_xgb,665908.011413
4,fatalities003_nl_vdem_hurdle_xgb,96018.694591
5,fatalities003_nl_wdi_rf,48650.088309
6,fatalities003_nl_topics_rf,164361.410425
7,fatalities003_nl_topics_xgb,256316.802069
8,fatalities003_nl_topics_hurdle_lgb,66437.400058
9,fatalities003_nl_joint_broad_rf,328805.902059


In [41]:
# Same test-partition summary, sorted by ascending MSE (lowest first).
get_model_mse(ModelList, 'predictions_test_df').sort_values(by=['mse'])

Unnamed: 0,model,mse
13,fatalities003_nl_joint_narrow_hurdle_lgb,32225.830488
12,fatalities003_nl_joint_narrow_hurdle_xgb,34570.185765
11,fatalities003_joint_narrow_xgb,35070.568879
5,fatalities003_nl_wdi_rf,48650.088309
0,fatalities003_nl_baseline_rf,49273.511126
8,fatalities003_nl_topics_hurdle_lgb,66437.400058
4,fatalities003_nl_vdem_hurdle_xgb,96018.694591
2,fatalities003_nl_conflicthistory_hurdle_lgb,97776.831939
10,fatalities003_nl_joint_broad_hurdle_rf,112196.56029
1,fatalities003_nl_conflicthistory_rf,160455.801483


In [43]:
# Rows with the largest per-row MSE for each model on the test partition.
get_top_10_cases(ModelList, 'predictions_test_df')

Unnamed: 0_level_0,month_id,country_id,mse
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fatalities003_nl_baseline_rf,491,126,35522881.008833
fatalities003_nl_baseline_rf,475,133,11948512.565489
fatalities003_nl_baseline_rf,477,133,11072773.401089
fatalities003_nl_baseline_rf,489,213,9913754.491375
fatalities003_nl_baseline_rf,468,149,9209752.885938
...,...,...,...
fatalities003_nl_all_pca3_xgb,461,220,81984914.902293
fatalities003_nl_all_pca3_xgb,469,220,81275477.184191
fatalities003_nl_all_pca3_xgb,470,220,79672313.412752
fatalities003_nl_all_pca3_xgb,462,220,79381887.018645
