# Notebook to define ensemble for production, cm level
Version developed for ViEWS monthly updates: Fatalities003
## Including ensemble weighting

This notebook defines the ensemble used for production: selects a set of pre-trained models, retrieves and calibrates them, computes weights, and computes and stores the ensemble model predictions.

The models are kept in model storage; most of them are specified in the notebook fat_cm_constituentmodels.

The notebook draws on the following files in this repository:

Script files:
    Ensembling.py
    FetchData.py

Lists of models:
    ModelList_cm_{dev_id}.csv (not yet functional)
    List of pickles at local directory (will rewrite to drop dependence on this)

# Note
### The values stored for models 11, 12 and 13 are on the log scale, even though the outcome column for models 12 and 13 is named ged_sb_dep



In [2]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook

# Views 3
from viewser.operations import fetch
from viewser import Queryset, Column
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
# import views_dataviz
from views_runs import storage, ModelMetadata
from views_runs.storage import store, retrieve, fetch_metadata
from views_forecasts.extensions import *

#sklearn
from sklearn.metrics import mean_squared_error

# Other packages
import pickle as pkl

pd.set_option('display.float_format', lambda x: '%.6f' % x)

In [3]:
# Packages from this repository, Tools folder
import sys
sys.path.append('../')
sys.path.append('../Tools')
sys.path.append('../Intermediates')
import os

from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated
from FetchData import FetchData, RetrieveFromList
from ViewsEstimators import *


In [4]:
# Common parameters:

# Identifiers of the run whose stored predictions are used
dev_id = 'Fatalities003'
run_id = 'Fatalities003'
EndOfHistory = 509
# True: estimate the genetic weights in this session; False: load them from the pickle
RunGeneticAlgo = True
level = 'cm'
get_future = False

username = os.getlogin()

# Which steps to train and predict for
steps = list(range(1, 37))

fi_steps = [1, 3, 6, 12, 36]

# Specifying partitions
# (presumably the first and last month_id of each period -- confirm against views_runs)
calib_partitioner_dict = dict(train=(121, 396), predict=(397, 444))
test_partitioner_dict = dict(train=(121, 444), predict=(445, 492))
future_partitioner_dict = dict(train=(121, 492), predict=(493, 504))
calib_partitioner = views_runs.DataPartitioner(dict(calib=calib_partitioner_dict))
test_partitioner = views_runs.DataPartitioner(dict(test=test_partitioner_dict))
future_partitioner = views_runs.DataPartitioner(dict(future=future_partitioner_dict))

# User-specific locations for shared results, local pickles and documentation figures
Mydropbox = f'/Users/{username}/Dropbox (ViEWS)/ViEWS/'
localpath = f'/Users/{username}/Pickles/'
overleafpath = f'/Users/{username}/Dropbox (ViEWS)/Apps/Overleaf/VIEWS documentation {dev_id}/'

# Echo the resolved settings so that they are visible in the notebook output
for label, value in (
    ('User:', username),
    ('Dropbox path set to', Mydropbox),
    ('Overleaf path set to', overleafpath),
    ('Local path set to', localpath),
):
    print(label, value)

User: root
Dropbox path set to /Users/root/Dropbox (ViEWS)/ViEWS/
Overleaf path set to /Users/root/Dropbox (ViEWS)/Apps/Overleaf/VIEWS documentation Fatalities003/
Local path set to /Users/root/Pickles/


In [5]:
from ModelDefinitions import DefineEnsembleModels

# Constituent models of the ensemble for the chosen level of analysis
ModelList = DefineEnsembleModels(level)

# Show each model's position, name and training dataset.
# The position is what later cells use to address individual models (e.g. ModelList[11]).
for i, model in enumerate(ModelList):
    print(i, model['modelname'], model['data_train'])

0 fatalities003_nl_baseline_rf baseline003
1 fatalities003_nl_conflicthistory_rf conflict_ln
2 fatalities003_nl_conflicthistory_hurdle_lgb conflict_ln
3 fatalities003_nl_conflicthistory_long_xgb conflictlong_ln
4 fatalities003_nl_vdem_hurdle_xgb vdem_short
5 fatalities003_nl_wdi_rf wdi_short
6 fatalities003_nl_topics_rf topics_003
7 fatalities003_nl_topics_xgb topics_003
8 fatalities003_nl_topics_hurdle_lgb topics_003
9 fatalities003_nl_joint_broad_rf joint_broad
10 fatalities003_nl_joint_broad_hurdle_rf joint_broad
11 fatalities003_joint_narrow_xgb joint_narrow
12 fatalities003_nl_joint_narrow_hurdle_xgb joint_narrow
13 fatalities003_nl_joint_narrow_hurdle_lgb joint_narrow
14 fatalities003_nl_all_pca3_xgb all_features


In [6]:
# Columns to display: the logged outcome followed by one prediction column per step
stepcols = ['ln_ged_sb_dep'] + ['step_pred_' + str(step) for step in steps]

# Read the stored calibration-partition predictions of model 11 straight from the
# prediction store to inspect the scale they are stored on
stored_modelname_calib = '_'.join([level, ModelList[11]['modelname'], 'calib'])
pd.DataFrame.forecasts.read_store(stored_modelname_calib, run=dev_id)[stepcols]

pr_56_cm_fatalities003_joint_narrow_xgb_calib.parquet


Unnamed: 0_level_0,Unnamed: 1_level_0,ln_ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.000000,0.008420,0.011582,0.012673,0.014179,0.014431,0.014855,0.016646,0.020506,0.017444,...,0.024846,0.026930,0.027107,0.117257,0.025644,0.025099,0.024437,0.022103,0.022669,0.023872
397,2,0.000000,0.008420,0.011476,0.012418,0.014019,0.013617,0.014294,0.016476,0.021644,0.018288,...,0.022748,0.021641,0.022437,0.021148,0.020752,0.019647,0.018917,0.018413,0.019921,0.020009
397,3,0.000000,0.008420,0.011476,0.012418,0.014019,0.013617,0.014294,0.015214,0.015707,0.018288,...,0.022748,0.021641,0.022437,0.021148,0.020752,0.019647,0.018917,0.018413,0.019921,0.020009
397,4,0.000000,0.008659,0.011582,0.012673,0.014421,0.014431,0.017202,0.045097,0.018661,0.022210,...,0.074820,0.036520,0.056549,0.050284,0.061734,0.053394,0.067794,0.065420,0.123116,0.128227
397,5,0.000000,0.008420,0.011476,0.012418,0.014019,0.013617,0.014294,0.016476,0.021644,0.018288,...,0.022748,0.021641,0.022437,0.021148,0.020752,0.019647,0.018917,0.018413,0.019921,0.020009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.000000,0.017873,0.019947,0.019739,0.025537,0.019502,0.023194,0.020523,0.021112,0.024462,...,0.133979,0.135839,0.137115,0.150589,0.164453,0.165343,0.166977,0.167235,0.175178,0.169526
444,243,0.000000,0.008511,0.011476,0.012418,0.014019,0.013617,0.014294,0.015214,0.015707,0.015991,...,0.023141,0.022411,0.023122,0.021496,0.021082,0.019647,0.019860,0.019287,0.020205,0.020908
444,244,0.000000,0.009407,0.012381,0.012647,0.014203,0.013976,0.014618,0.015567,0.017467,0.018470,...,0.061876,0.107264,0.122521,0.117023,0.057673,0.051980,0.062827,0.053596,0.062296,0.038805
444,245,0.000000,1.889669,2.477441,3.376249,2.985935,3.317934,2.572747,3.731551,4.061934,4.168920,...,1.934937,2.045012,1.900755,2.903455,3.046722,2.139882,2.063412,2.089220,2.078251,1.945472


# Retrieve and calibrate predictions

In [7]:
# Retrieve the stored predictions for the calibration and test partitions.
# The returned list replaces ModelList; the cells below read each model's
# 'predictions_calib_df' and 'predictions_test_df' entries from it.

ModelList = RetrieveStoredPredictions(ModelList, steps, EndOfHistory, dev_id, level, get_future)

# Calibration step, currently disabled.
# NOTE(review): the ensemble-construction cell further down reads
# 'calib_df_calibrated' and 'test_df_calibrated' from every model -- presumably
# those are produced by CalibratePredictions; confirm before running that cell.
# ModelList = CalibratePredictions(ModelList, EndOfHistory, steps)

0 fatalities003_nl_baseline_rf
pr_56_cm_fatalities003_nl_baseline_rf_calib.parquet
pr_56_cm_fatalities003_nl_baseline_rf_test.parquet
1 fatalities003_nl_conflicthistory_rf
pr_56_cm_fatalities003_nl_conflicthistory_rf_calib.parquet
pr_56_cm_fatalities003_nl_conflicthistory_rf_test.parquet
2 fatalities003_nl_conflicthistory_hurdle_lgb
pr_56_cm_fatalities003_nl_conflicthistory_hurdle_lgb_calib.parquet
pr_56_cm_fatalities003_nl_conflicthistory_hurdle_lgb_test.parquet
3 fatalities003_nl_conflicthistory_long_xgb
pr_56_cm_fatalities003_nl_conflicthistory_long_xgb_calib.parquet
pr_56_cm_fatalities003_nl_conflicthistory_long_xgb_test.parquet
4 fatalities003_nl_vdem_hurdle_xgb
pr_56_cm_fatalities003_nl_vdem_hurdle_xgb_calib.parquet
pr_56_cm_fatalities003_nl_vdem_hurdle_xgb_test.parquet
5 fatalities003_nl_wdi_rf
pr_56_cm_fatalities003_nl_wdi_rf_calib.parquet
pr_56_cm_fatalities003_nl_wdi_rf_test.parquet
6 fatalities003_nl_topics_rf
pr_56_cm_fatalities003_nl_topics_rf_calib.parquet
pr_56_cm_fatali

In [8]:
# Model 11 (fatalities003_joint_narrow_xgb) is stored with a logged outcome column
# ('ln_ged_sb_dep'). Invert the log(1 + y) transform on every column and rename the
# outcome column so that the frames line up with those of the non-logged models.
#
# np.expm1 is the vectorised, numerically accurate equivalent of exp(x) - 1 and
# replaces the element-wise applymap, which is deprecated in recent pandas versions.
#
# WARNING: this cell is not idempotent -- running it a second time exponentiates
# the already back-transformed values again. Re-run the retrieval cell first.
for partition_key in ('predictions_calib_df', 'predictions_test_df'):
    ModelList[11][partition_key] = (
        np.expm1(ModelList[11][partition_key])
        .rename(columns={'ln_ged_sb_dep': 'ged_sb_dep'})
    )
ModelList[11]['predictions_test_df']

Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
445,1,0.000000,0.009167,0.009891,0.012589,0.013190,0.013403,0.015648,0.015318,0.028433,0.018103,...,0.024727,0.034935,0.030096,0.031895,0.026588,0.028692,0.028179,0.025854,0.027549,0.028055
445,2,0.000000,0.009167,0.009697,0.012589,0.013058,0.013226,0.014898,0.015174,0.028433,0.018103,...,0.019271,0.019915,0.021934,0.020112,0.019567,0.019655,0.018728,0.017578,0.017987,0.018030
445,3,0.000000,0.009167,0.009697,0.011348,0.013058,0.013226,0.014898,0.015174,0.014908,0.014455,...,0.019271,0.019915,0.021934,0.020112,0.019567,0.019655,0.018728,0.017578,0.017987,0.018030
445,4,0.000000,0.012901,0.020516,0.017851,0.025427,0.028214,0.028537,0.023996,0.037756,0.020045,...,0.030460,0.030369,0.035826,0.052080,0.057105,0.080281,0.085169,0.101610,0.126006,0.139494
445,5,0.000000,0.009167,0.009697,0.012589,0.013058,0.013226,0.014898,0.015174,0.028433,0.018103,...,0.019271,0.019915,0.021934,0.020112,0.019567,0.019655,0.018728,0.017578,0.017987,0.018030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,242,0.000000,1.366919,1.430469,0.029422,0.032619,0.030597,0.042787,0.046214,0.057206,0.067597,...,0.157345,0.160875,0.168210,0.171885,0.185036,0.177733,0.186591,0.179903,0.183747,0.212211
492,243,0.000000,1.099701,0.009697,0.011619,0.013058,0.013226,0.015052,0.015174,0.014908,0.014812,...,0.019453,0.020766,0.021934,0.020112,0.019906,0.020171,0.019127,0.018131,0.017987,0.018030
492,244,0.000000,0.009613,0.011301,0.013539,0.014310,0.014538,0.015642,0.015674,0.018947,0.014960,...,0.039761,0.042759,0.042088,0.042418,0.042666,0.035804,0.032693,0.032309,0.049032,0.037846
492,245,0.000000,0.755424,2.479162,4.422010,7.200095,5.574021,2.070196,0.294606,0.310444,0.391680,...,12.175900,10.848603,36.012710,27.432978,24.526728,18.529865,8.097266,9.904621,14.033304,9.300242


In [9]:
ModelList[10]['predictions_calib_df']

Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.000000,3.643311,5.920910,7.879053,5.581651,8.760094,5.448617,5.667788,22.457182,6.099809,...,6.511542,6.077412,47.772427,5.677524,6.851977,5.410686,7.825408,16.408413,8.629659,19.000309
397,2,0.000000,3.643311,8.131008,7.904457,7.781753,11.044792,11.481462,8.091896,5.769181,6.099809,...,20.412523,10.730457,7.552030,7.272524,7.994062,10.491458,6.537792,14.979412,18.192364,46.782261
397,3,0.000000,3.514450,4.638184,5.945802,7.756139,8.718470,7.788080,5.666050,5.679998,6.142528,...,9.844352,12.428825,63.261288,62.905190,83.684967,85.944405,88.437523,65.603859,46.323067,33.823582
397,4,0.000000,3.558230,4.638184,5.842495,5.591039,5.188852,5.259677,5.266651,5.749945,5.804189,...,31.325960,40.759995,33.416431,10.007353,84.963203,16.074759,89.098747,65.808014,38.264713,29.090097
397,5,0.000000,4.113445,6.753009,7.912243,11.613929,12.066689,13.160626,8.471023,10.156227,21.468166,...,29.027868,25.583740,35.072685,35.337509,29.200043,32.002769,24.086016,28.186235,27.635090,40.971165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.000000,3.537486,4.645221,5.850566,5.693751,48.851048,5.603571,14.881537,339.844360,279.068207,...,40.971451,13.105615,768.811096,467.567169,31.998230,25.599174,64.331314,24.555630,47.100986,63.612919
444,243,0.000000,3.516138,4.638184,15.204786,5.564894,5.025270,5.179839,5.226724,5.620315,5.766045,...,5.534801,5.694414,5.048412,8.436039,5.902206,5.421671,5.708171,6.976517,5.715421,6.534455
444,244,0.000000,6.971416,6.669143,5.641257,5.516807,6.717100,7.876533,5.175279,5.558433,5.747992,...,1151.659668,682.912048,719.775208,799.785645,11.675535,13.164781,24.956394,19.357574,15.430462,28.640648
444,245,0.000000,5606.384766,11864.048828,381.322571,191.086700,2199.679199,762.624512,394.066437,99.535492,232.977051,...,288.405792,161.421585,250.241058,242.042267,392.343323,1238.751709,373.213562,153.002579,2524.446289,649.494019


In [10]:
ModelList[11]['predictions_calib_df']

Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.000000,0.008456,0.011650,0.012754,0.014280,0.014535,0.014966,0.016786,0.020718,0.017597,...,0.025157,0.027296,0.027478,0.124409,0.025975,0.025417,0.024738,0.022349,0.022928,0.024159
397,2,0.000000,0.008456,0.011542,0.012495,0.014118,0.013710,0.014396,0.016612,0.021880,0.018456,...,0.023008,0.021877,0.022690,0.021373,0.020969,0.019841,0.019097,0.018584,0.020120,0.020211
397,3,0.000000,0.008456,0.011542,0.012495,0.014118,0.013710,0.014396,0.015331,0.015831,0.018456,...,0.023008,0.021877,0.022690,0.021373,0.020969,0.019841,0.019097,0.018584,0.020120,0.020211
397,4,0.000000,0.008696,0.011650,0.012754,0.014525,0.014535,0.017351,0.046129,0.018836,0.022459,...,0.077690,0.037195,0.058178,0.051570,0.063679,0.054846,0.070145,0.067607,0.131016,0.136811
397,5,0.000000,0.008456,0.011542,0.012495,0.014118,0.013710,0.014396,0.016612,0.021880,0.018456,...,0.023008,0.021877,0.022690,0.021373,0.020969,0.019841,0.019097,0.018584,0.020120,0.020211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.000000,0.018034,0.020148,0.019935,0.025866,0.019693,0.023465,0.020735,0.021336,0.024763,...,0.143369,0.145498,0.146960,0.162518,0.178749,0.179797,0.181727,0.182032,0.191459,0.184743
444,243,0.000000,0.008547,0.011542,0.012495,0.014118,0.013710,0.014396,0.015331,0.015831,0.016119,...,0.023410,0.022664,0.023392,0.021729,0.021306,0.019841,0.020059,0.019474,0.020411,0.021128
444,244,0.000000,0.009451,0.012458,0.012727,0.014305,0.014074,0.014725,0.015689,0.017620,0.018641,...,0.063830,0.113228,0.130343,0.124145,0.059369,0.053355,0.064842,0.055058,0.064277,0.039568
444,245,0.000000,5.617181,10.910744,28.260810,18.805006,26.603264,12.101759,40.743790,57.086542,63.645600,...,5.923611,6.729249,5.690944,17.237037,20.046245,7.498433,6.872789,7.078612,6.990484,5.996935


In [11]:
ModelList[12]['predictions_calib_df']

Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.000000,0.014122,0.032286,0.031657,0.014556,0.023268,0.070136,0.096330,0.045696,0.048645,...,0.028014,0.036451,0.028437,0.038081,0.017845,0.016587,0.012811,0.014901,0.014477,0.014666
397,2,0.000000,0.014909,0.049534,0.044159,0.030006,0.020361,0.100653,0.225381,0.376219,0.170326,...,0.063525,0.012735,0.032341,0.079902,0.011249,0.010377,0.024336,0.038316,0.015640,0.015959
397,3,0.000000,0.011843,0.018704,0.011550,0.017866,0.008789,0.012949,0.011399,0.011505,0.011468,...,0.011795,0.017431,0.012113,0.011314,0.011518,0.010629,0.011959,0.010589,0.011358,0.017218
397,4,0.000000,0.008122,0.014983,0.022001,0.008993,0.015384,0.021825,0.024464,0.010139,0.012643,...,0.009501,0.018339,0.032978,0.027440,0.010549,0.009224,0.020776,0.039796,0.012703,0.018206
397,5,0.000000,0.011057,0.050910,0.041537,0.028097,0.023905,0.077486,0.511309,0.143809,0.217904,...,0.075958,0.021257,0.041879,0.113413,0.010661,0.013518,0.028851,0.047585,0.020017,0.020822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.000000,0.014622,0.014716,0.014308,0.015706,0.012077,0.018626,0.026236,0.025119,0.029836,...,0.039435,0.025941,0.039624,0.029915,0.045532,0.041854,0.095691,0.081643,0.082414,0.039378
444,243,0.000000,0.023654,0.016173,0.010473,0.008405,0.030064,0.018461,0.008304,0.031978,0.025732,...,0.069346,0.020887,0.011734,0.021906,0.029541,0.030086,0.031606,0.033326,0.031702,0.011472
444,244,0.000000,0.063916,0.038730,0.045257,0.052500,0.031539,0.040085,0.054072,0.033069,0.037331,...,0.172098,0.119958,0.166611,0.184910,0.194046,0.233553,0.221601,0.162064,0.092159,0.120524
444,245,0.000000,1.802847,2.009877,3.851048,1.916607,2.287658,1.451762,2.018101,2.151023,3.274719,...,1.712984,1.033729,0.822829,1.035813,1.426152,1.752635,2.019395,1.741394,1.790091,0.810265


In [12]:
# Models 12 and 13 hold log-scale values although their outcome column is already
# named 'ged_sb_dep' (see the note at the top of the notebook), so only the
# values are transformed here -- no column is renamed.
#
# np.expm1 is the vectorised, numerically accurate equivalent of exp(x) - 1 and
# replaces the element-wise applymap, which is deprecated in recent pandas versions.
#
# WARNING: this cell is not idempotent -- running it a second time exponentiates
# the already back-transformed values again. Re-run the retrieval cell first.
for model_position in (12, 13):
    for partition_key in ('predictions_calib_df', 'predictions_test_df'):
        ModelList[model_position][partition_key] = np.expm1(ModelList[model_position][partition_key])


In [13]:
ModelList[12]['predictions_calib_df']

Unnamed: 0_level_0,Unnamed: 1_level_0,ged_sb_dep,step_pred_1,step_pred_2,step_pred_3,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9,...,step_pred_27,step_pred_28,step_pred_29,step_pred_30,step_pred_31,step_pred_32,step_pred_33,step_pred_34,step_pred_35,step_pred_36
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
397,1,0.000000,0.014223,0.032813,0.032163,0.014663,0.023541,0.072654,0.101123,0.046757,0.049848,...,0.028410,0.037123,0.028845,0.038816,0.018005,0.016726,0.012893,0.015012,0.014582,0.014774
397,2,0.000000,0.015020,0.050782,0.045149,0.030461,0.020570,0.105893,0.252800,0.456767,0.185692,...,0.065586,0.012816,0.032869,0.083181,0.011312,0.010431,0.024635,0.039059,0.015763,0.016087
397,3,0.000000,0.011913,0.018880,0.011617,0.018026,0.008827,0.013033,0.011464,0.011571,0.011534,...,0.011865,0.017584,0.012187,0.011379,0.011584,0.010685,0.012031,0.010645,0.011423,0.017367
397,4,0.000000,0.008155,0.015096,0.022245,0.009033,0.015503,0.022064,0.024766,0.010190,0.012724,...,0.009546,0.018508,0.033528,0.027820,0.010605,0.009267,0.020994,0.040599,0.012784,0.018373
397,5,0.000000,0.011118,0.052228,0.042411,0.028495,0.024193,0.080568,0.667473,0.154663,0.243467,...,0.078917,0.021485,0.042768,0.120095,0.010718,0.013609,0.029272,0.048735,0.020219,0.021040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,242,0.000000,0.014730,0.014825,0.014411,0.015830,0.012150,0.018801,0.026583,0.025437,0.030286,...,0.040223,0.026281,0.040420,0.030367,0.046584,0.042742,0.100418,0.085068,0.085905,0.040164
444,243,0.000000,0.023936,0.016305,0.010528,0.008441,0.030520,0.018633,0.008339,0.032495,0.026066,...,0.071807,0.021107,0.011803,0.022148,0.029982,0.030543,0.032111,0.033888,0.032210,0.011538
444,244,0.000000,0.066002,0.039489,0.046297,0.053903,0.032042,0.040899,0.055561,0.033622,0.038036,...,0.187794,0.127449,0.181295,0.203111,0.214152,0.263080,0.248073,0.175936,0.096539,0.128087
444,245,0.000000,5.066896,6.462401,46.042315,5.797855,8.851835,3.270634,6.524027,7.593644,25.435783,...,4.545487,1.811531,1.276932,1.817396,3.162648,4.769783,6.533763,4.705291,4.989998,1.248504


# Evaluate models

In [14]:
# Key of the per-model dataframe that the evaluation functions below read.
# Swap the two lines to evaluate on the test partition instead.
prediction_df = 'predictions_calib_df'
# prediction_df = 'predictions_test_df'

In [15]:
def calculate_mse(ModelList):
    """
    Add an 'mse' column to the evaluated dataframe of every model.

    For each row (one month_id / country_id pair) the mean squared error is taken
    across all step-specific predictions against the observed 'ged_sb_dep' value.
    The dataframe is selected by the notebook-level ``prediction_df`` key and the
    prediction columns by the notebook-level ``steps`` list.

    ModelList : list of model dictionaries; each dataframe under ``prediction_df``
        is modified in place.

    Returns None.
    """
    # Derive the columns from `steps`, so the number of steps is no longer hard-coded.
    pred_cols = [f'step_pred_{i}' for i in steps]
    for model in ModelList:
        df = model[prediction_df]
        # Vectorised over all rows: (prediction - observed) for every step column.
        errors = df[pred_cols].sub(df['ged_sb_dep'], axis=0)
        # skipna=False: a row with a missing prediction gets NaN instead of a
        # misleading average over the remaining steps.
        df['mse'] = errors.pow(2).mean(axis=1, skipna=False)

calculate_mse(ModelList)

In [16]:
def get_model_mse(ModelList):
    """
    Summarise the row-level 'mse' column of every model into one average per model.

    Reads the dataframe stored under the notebook-level ``prediction_df`` key of
    each model dictionary and returns a dataframe with the columns 'model' and
    'mse', one row per model, in the order of ModelList.
    """
    names = [entry['modelname'] for entry in ModelList]
    average_errors = [entry[prediction_df]['mse'].mean() for entry in ModelList]
    return pd.DataFrame({'model': names, 'mse': average_errors})

get_model_mse(ModelList) 

Unnamed: 0,model,mse
0,fatalities003_nl_baseline_rf,226794.434086
1,fatalities003_nl_conflicthistory_rf,233468.644101
2,fatalities003_nl_conflicthistory_hurdle_lgb,208824.486308
3,fatalities003_nl_conflicthistory_long_xgb,527213.482069
4,fatalities003_nl_vdem_hurdle_xgb,710133.395333
5,fatalities003_nl_wdi_rf,218310.056328
6,fatalities003_nl_topics_rf,270199.737508
7,fatalities003_nl_topics_xgb,478322.044438
8,fatalities003_nl_topics_hurdle_lgb,229854.243255
9,fatalities003_nl_joint_broad_rf,301660.673825


In [17]:
get_model_mse(ModelList).sort_values(by=['mse'])

Unnamed: 0,model,mse
2,fatalities003_nl_conflicthistory_hurdle_lgb,208824.486308
5,fatalities003_nl_wdi_rf,218310.056328
13,fatalities003_nl_joint_narrow_hurdle_lgb,225942.52528
0,fatalities003_nl_baseline_rf,226794.434086
12,fatalities003_nl_joint_narrow_hurdle_xgb,228206.830828
11,fatalities003_joint_narrow_xgb,228245.11615
8,fatalities003_nl_topics_hurdle_lgb,229854.243255
1,fatalities003_nl_conflicthistory_rf,233468.644101
6,fatalities003_nl_topics_rf,270199.737508
9,fatalities003_nl_joint_broad_rf,301660.673825


In [18]:
def get_top_10_cases(ModelList):
    """
    Collect, for every model, the (up to) ten rows with the largest 'mse'.

    Reads the dataframe stored under the notebook-level ``prediction_df`` key of
    each model dictionary; the first index level is reported as 'month_id' and
    the second as 'country_id'.

    Returns a dataframe indexed by model name with the columns 'month_id',
    'country_id' and 'mse', sorted by descending mse within each model.
    """
    top_10_cases = {'model': [], 'month_id': [], 'country_id': [], 'mse': []}
    for model in ModelList:
        worst_rows = model[prediction_df].sort_values(by=['mse'], ascending=False).head(10)
        # Repeat the name once per selected row: a frame with fewer than ten rows
        # must not produce columns of unequal length.
        top_10_cases['model'].extend([model['modelname']] * len(worst_rows))
        top_10_cases['month_id'].extend(worst_rows.index.get_level_values(0))
        top_10_cases['country_id'].extend(worst_rows.index.get_level_values(1))
        top_10_cases['mse'].extend(worst_rows['mse'])
    return pd.DataFrame(top_10_cases).set_index('model')

get_top_10_cases(ModelList)

Unnamed: 0_level_0,month_id,country_id,mse
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fatalities003_nl_baseline_rf,420,220,343347462.743495
fatalities003_nl_baseline_rf,408,220,339572761.048223
fatalities003_nl_baseline_rf,444,220,288867512.324259
fatalities003_nl_baseline_rf,432,220,276694196.499313
fatalities003_nl_baseline_rf,398,220,31257369.266348
...,...,...,...
fatalities003_nl_all_pca3_xgb,422,70,71738516.922842
fatalities003_nl_all_pca3_xgb,434,220,54954921.294722
fatalities003_nl_all_pca3_xgb,433,220,53884778.173239
fatalities003_nl_all_pca3_xgb,438,220,52900032.317508


# Genetic algorithm

In [None]:
from joblib import Parallel, delayed, cpu_count
from functools import partial
from genetic2 import *

from pathlib import Path

def make_run_from_step (
    step,
    e_set,
    df_name = 'calib_df_calibrated', 
    target = 'ln_ged_sb_dep',
    population_count = 100,
    initial_population = None,
    base_genes = np.array([0,1]),
    number_of_generations = 500
):
    """
    Run the genetic algorithm that searches ensemble weights for one step.

    step : step you want as an int; the column step_pred_{step} is used from every model,
    e_set : ensemble set, a list of model dictionaries (e.g. ModelList), each with a
        'modelname' and a dataframe stored under df_name,
    df_name : name of the df in the ensemble set you want,
    target : Y in prediction; read from the dataframe of the first model in e_set,
    population_count : size of the generated starting population
        (only used when initial_population is None),
    initial_population : optional ready-made starting population,
    base_genes : array of gene values handed to the genetic algorithm
        (presumably the admissible weight values -- confirm in genetic2),
    number_of_generations : passed to genetic_algorithm as ngen.

    Returns a dict with the keys 'step', 'memoization_id' and 'generation'.

    Side effect: creates ./exploration_pickle/ if needed and writes
    id_{memoization_id}.csv there, linking the step to the memoization id.

    Raises ValueError if e_set is empty.
    """
    
    df_step = f'step_pred_{step}'
    
    # Assemble one frame holding the target plus one prediction column per model.
    # The models are taken from e_set, not from a notebook-level global.
    aggregate_df = None
    for i_ens in e_set:
        model_col = f'{df_step}_{i_ens["modelname"]}'
        if aggregate_df is None:
            # The first model also supplies the target column.
            aggregate_df = i_ens[df_name][[target, df_step]].copy()
            aggregate_df = aggregate_df.rename(columns = {df_step : model_col})
        else:
            # Name every column after its model before joining, so that all
            # models are labelled the same way.
            aggregate_df = aggregate_df.join(
                i_ens[df_name][[df_step]].rename(columns = {df_step : model_col})
            )
    if aggregate_df is None:
        raise ValueError('e_set is empty: no model predictions to build the ensemble from')
    
    aggregate_df = aggregate_df.dropna()
    # Leave out any column whose name marks it as an ensemble.
    aggregate_df = aggregate_df[aggregate_df.columns[~aggregate_df.columns.str.contains('ensemble')]]
    
    X = aggregate_df.drop(columns=[target])
    Y = aggregate_df[target]
    
    # Fitness function with data and metric bound; the algorithm supplies the weights.
    inst_mse = partial(weighted_mse_score, Y, X, mean_squared_error)
    if initial_population is None:
        # NOTE(review): the meaning of the trailing 0.5 and 3 is defined in genetic2 -- confirm there.
        population =  init_population_sum(population_count,base_genes,X.shape[1],0.5,3)
    else: 
        population = initial_population
    
    # Imported at call time -- presumably so that each worker process reports its
    # own identifier; confirm in genetic2.
    from genetic2 import temp_file_name
    Path('./exploration_pickle/').mkdir(parents=True, exist_ok=True) 
    pd.DataFrame({'step':[step], 'memoization_id':[temp_file_name]}).to_csv(f'exploration_pickle/id_{temp_file_name}.csv', index=False)
    
    generation = genetic_algorithm(population, 
                                   inst_mse, 
                                   base_genes, 
                                   f_thres=None, 
                                   ngen=number_of_generations, 
                                   pmut=0.2)
    return {'step':step, 'memoization_id':temp_file_name, 'generation':generation}
    

In [None]:
# Gene sets that can be passed to the genetic algorithm as base_genes
# (presumably the values an individual model weight may take -- confirm in genetic2)
super_walrus_genes = np.array([
    0, 0.010, 0.015, 0.020, 0.025, 0.030, 0.04, 0.05, 0.06, 0.07,
    0.08, 0.09, 0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.25, 0.30,
])
nonlogged_genes = np.array([
    0, 0.02, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30,
    0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7,
    0.75, 0.8, 0.9, 1, 1.1, 1.2, 1.5, 2.0,
])
print(len(nonlogged_genes))

# Steps for which weights are estimated directly
steps_to_optimize = [1, 2, 3, 4, 6, 9, 12, 15, 18, 24, 30, 36]

In [None]:
# Bind every argument except the step, so that the parallel loop below only has
# to supply the step number.
# NOTE(review): base_genes is super_walrus_genes although nonlogged_genes is
# defined above and the predictions used here are non-logged -- confirm which
# gene set is intended.
filled_function = partial(make_run_from_step, 
    e_set = ModelList,
    df_name = 'predictions_calib_df', # Non-logged version
    # The outcome column of the non-logged prediction frames is 'ged_sb_dep'
    # (the same column the MSE evaluation above reads); 'ln_ged_sb_dep' is not
    # present in them after the back-transformation.
    target = 'ged_sb_dep',
    population_count = 100,
    initial_population = None,
    base_genes = super_walrus_genes,
    number_of_generations = 200
)



In [None]:
# Number of parallel workers: leave four cores free, but never go below one.
# (The previous expression gave -1 or 0 on machines with 3 or 4 cores; joblib
# rejects n_jobs=0 and reads -1 as "use every core".)
cpus = max(1, cpu_count() - 4)
# Workers left over once every optimised step has its own worker
# (a negative value means some steps have to wait for a free worker)
cpus - len(steps_to_optimize)

In [None]:
# 15 models, 24 genes, 12 steps, 100 generations takes 41 minutes
# Imported here because no earlier cell imports it explicitly.
from datetime import datetime

picklename = 'exploration_pickle/full_gen.pickle'
if RunGeneticAlgo:
    # Estimate the weights, one parallel job per optimised step, and cache the result.
    ct = datetime.now()
    print('Estimating genetic weights, current time:', ct)
    generations = Parallel(n_jobs=cpus)(delayed(filled_function)(i) for i in steps_to_optimize)
    ct = datetime.now()
    print('Done estimating weights, current time:', ct)
    with open(picklename, 'wb') as handle:
        # The pickle module is imported as pkl in this notebook.
        pkl.dump(generations, handle, protocol=pkl.HIGHEST_PROTOCOL)
else:
    # Reuse the result of an earlier run.
    # NOTE: only load a pickle written by this notebook -- unpickling a file from
    # an untrusted source can execute arbitrary code.
    with open(picklename, 'rb') as handle:
        generations = pkl.load(handle)
    

In [None]:
# Print each step together with its memoization id, so that the training process
# of that step can be explored in the visualizer
for i in generations:
    print (i['step'], i['memoization_id'])

In [None]:
# Fetch the best organism of every step.
# The best is always the top organism of a generation; the top 20 could be obtained
# by slicing gen['generation'][0:20] and so on.
GeneticAlgoResult = []
for gen in generations:
    top_organism = gen['generation'][0]
    print ('\nStep: ',gen['step'],'\n','*'*24,'\n')
    print (top_organism)
    # An organism is stored as a pair: position 0 goes to 'Org', position 1 to 'Fitness'
    GeneticAlgoResult.append(
        dict(Org=top_organism[0], Fitness=top_organism[1], Step=gen['step'])
    )
print(GeneticAlgoResult)

# Assignment of the genetic weights

In [None]:
# Reading from GeneticAlgoResult:
# w_step[s] holds the best organism found for step s. Position 0 and every step
# without a result stay None.
w_step = [None] * 37
for line in GeneticAlgoResult:
    w_step[line['Step']] = line['Org']

# Print the sum of the weights for a selection of the optimised steps
for i in [1,2,3,6,9,12,18,24,30,36]:
    print(sum(w_step[i]))




In [None]:
# Weight matrix with placeholders: optimised steps get their genetic weights, all
# other steps a vector of NaN of the same length (one entry per model).
# No interpolation happens here, and WeightMatrix and modelnames are rebuilt from
# scratch in the next cell.
print(steps_to_optimize)
WeightMatrix = [None] * 37
modelnames = [model['modelname'] for model in ModelList]
for step in steps:
    if step in steps_to_optimize:
        WeightMatrix[step] = w_step[step]
    else:
        # A list of NaN, one per model (np.nan * n would be a single scalar NaN)
        WeightMatrix[step] = [np.nan] * len(w_step[1])


In [None]:
# StepAssigner[step-1] is the optimised step whose weights are copied to `step`
StepAssigner = (
    [1, 2, 3, 4, 4]
    + [6] * 2
    + [9] * 3
    + [12] * 3
    + [15] * 3
    + [18] * 5
    + [24] * 6
    + [30] * 6
    + [36] * 3
)

stepcols = ['ln_ged_sb_dep'] + ['step_pred_' + str(step) for step in steps]
modelnames = [model['modelname'] for model in ModelList]

# Position 0 is unused, so that WeightMatrix[step] can be indexed by step number
WeightMatrix = [None] * 37
for step in steps:
    WeightMatrix[step] = w_step[StepAssigner[step-1]]

# Transpose to one row per model and one column per step
wmt = np.array(WeightMatrix[1:]).T
weights_df = pd.DataFrame(wmt,columns=stepcols[1:],index=modelnames)
weights_df

In [None]:
# Interpolated weights
# Every step that was not optimised directly gets a linear interpolation between
# the weights of the nearest optimised step below and the nearest one above it.
# The anchor weights are always read from weights_df, so the result does not
# depend on the order in which the columns are filled.
i_weights_df = weights_df.copy()
anchor_steps = sorted(steps_to_optimize)
for step in steps:
    if step in anchor_steps or step < anchor_steps[0] or step > anchor_steps[-1]:
        # Optimised steps keep their own weights; nothing is extrapolated
        # beyond the first or last optimised step.
        continue
    prestep = max(s for s in anchor_steps if s < step)
    poststep = min(s for s in anchor_steps if s > step)
    col = 'step_pred_' + str(step)
    prestepcol = 'step_pred_' + str(prestep)
    poststepcol = 'step_pred_' + str(poststep)
    # Each anchor is weighted by the distance to the *other* anchor, e.g. step 20
    # between 18 and 24 gets (4 * w18 + 2 * w24) / 6.
    i_weights_df[col] = (
        (weights_df[prestepcol] * (poststep - step))
        + (weights_df[poststepcol] * (step - prestep))
    ) / (poststep - prestep)

print(steps_to_optimize)
# Export weights 
i_weights_df.to_csv('../Intermediates/GeneticWeights.csv')
# Save the weights dfs
dflist = [
    (i_weights_df,'i_weights_df'), 
]

path = Mydropbox + 'Projects/PredictingFatalities/MSEs/'
for df in dflist:
    filename = path + df[1] + '.csv'
    df[0].to_csv(filename)
    

In [None]:
import seaborn as sns

# Heatmap of the genetic-algorithm weights held in i_weights_df.
# Colormap: 100-step cubehelix palette (the only palette that was actually in effect;
# earlier assignments to `palette` were overwritten before use and have been removed).
palette = sns.cubehelix_palette(start=2, rot=0, dark=0, light=1, n_colors=100)

fig, ax = plt.subplots(1, figsize=(16, 11))
# Draw on the axes created above instead of relying on pyplot's implicit current axes
sns.heatmap(i_weights_df, xticklabels=2, linewidths=.5, cmap=palette, square=True, ax=ax)
filename = overleafpath + 'Evaluation/Figures/genetic_weights.png'
# Only write the figure to the Overleaf folder when running under this login.
# `username` is set from os.getlogin() in the common-parameters cell; the login name
# must be compared as a string (a bare havardhegre1 is an undefined name -> NameError).
if username == 'havardhegre1':
    plt.savefig(filename, dpi=300)

In [None]:
# Constructing dfs to hold the predictions
# A list of dictionaries organizing predictions and information as one step per entry,
# including a dataframe for each step with one column per prediction model.
# Each entry holds:
#   'step_pred'       : name of the step column the entry refers to
#   'df_calib'/'df_test'             : observed 'ln_ged_sb_dep' plus one column per model
#   'ensembles_calib'/'ensembles_test': observed 'ln_ged_sb_dep' plus one column per ensemble
baseline = ModelList[0]  # Use the baseline as template to construct object
StepEnsembles = []
for col in stepcols[1:]:
    Step_prediction = {
        'step_pred': col,
        'df_calib': pd.DataFrame(baseline['calib_df_calibrated']['ln_ged_sb_dep']),
        'df_test': pd.DataFrame(baseline['test_df_calibrated']['ln_ged_sb_dep']),
        'ensembles_calib': pd.DataFrame(baseline['calib_df_calibrated']['ln_ged_sb_dep']),
        'ensembles_test': pd.DataFrame(baseline['test_df_calibrated']['ln_ged_sb_dep'])
    }
    # One column per constituent model, holding that model's prediction for this step
    for model in ModelList:
        modelname = model['modelname']
        Step_prediction['df_calib'][modelname] = model['calib_df_calibrated'][col]
        Step_prediction['df_test'][modelname] = model['test_df_calibrated'][col]
    StepEnsembles.append(Step_prediction)

# Calculating unweighted average ensembles
for step_ensemble in StepEnsembles:
    # Row-wise mean over the model columns only (the observed outcome is dropped first).
    # Both partitions get a *column* named 'unweighted_average'; the calibration
    # partition previously used .loc[...], which addresses a row label instead.
    step_ensemble['ensembles_test']['unweighted_average'] = (
        step_ensemble['df_test'].drop('ln_ged_sb_dep', axis=1).mean(axis=1)
    )
    step_ensemble['ensembles_calib']['unweighted_average'] = (
        step_ensemble['df_calib'].drop('ln_ged_sb_dep', axis=1).mean(axis=1)
    )


In [None]:
# Inspect the unweighted-average test-partition ensemble for the first entry in StepEnsembles
StepEnsembles[0]['ensembles_test']['unweighted_average']

In [None]:

# Calculating weighted average ensembles
# Based on the weights_df dataframe filled with Mihai's weights above

def ensemble_predictions(yhats, weights):
    """Combine member predictions into a single weighted sum.

    yhats is converted to a numpy array and the result is np.dot(weights, yhats),
    i.e. the members are summed along the first axis of yhats, each scaled by
    its weight.
    """
    member_predictions = np.array(yhats)
    return np.dot(weights, member_predictions)

# normalize a vector to have unit L1 norm
def normalize(weights):
    """Scale a weight vector so that its absolute values sum to one.

    Parameters
    ----------
    weights : array-like
        Weight vector (list, tuple or numpy array); converted with np.asarray.

    Returns
    -------
    numpy.ndarray
        weights divided by its L1 norm, or the (converted) input unchanged
        when the norm is zero, so an all-zero vector does not cause a
        division by zero.
    """
    weights = np.asarray(weights)
    # Call numpy's norm explicitly: the import cells visible at the top of the
    # notebook do not import a bare `norm`.
    l1_norm = np.linalg.norm(weights, 1)
    # check for a vector of all zeros
    if l1_norm == 0.0:
        return weights
    # return normalized vector (unit norm)
    return weights / l1_norm

for i, col in enumerate(stepcols[1:]):
    step_ensemble = StepEnsembles[i]
    # Weights for this step column, taken from i_weights_df
    # (assumes i_weights_df is indexed by model name so it lines up with the
    # model columns of the prediction frames -- TODO confirm)
    step_weights = i_weights_df[col]
    # Weighted average: scale each model column by its weight, then sum across models
    for members_key, ensembles_key in (('df_calib', 'ensembles_calib'),
                                       ('df_test', 'ensembles_test')):
        member_preds = step_ensemble[members_key].drop('ln_ged_sb_dep', axis=1)
        step_ensemble[ensembles_key]['weighted_average'] = (member_preds * step_weights).sum(axis=1)

In [None]:
# Reshape the ensemble predictions
# The per-step weighted averages are written into copies of the first model's
# calibrated frames, one column per step, so the ensemble has the same layout
# as an entry of ModelList.
genetic = {
    'modelname': 'ensemble_genetic',
    'algorithm': '',
    'depvar': "ln_ged_sb_dep",
    'calib_df_calibrated': ModelList[0]['calib_df_calibrated'].copy(),
    'test_df_calibrated': ModelList[0]['test_df_calibrated'].copy(),
}

for step_entry in StepEnsembles:
    target_col = step_entry['step_pred']
    print(target_col)
    # Overwrite the step column with the weighted-average ensemble for that step
    genetic['calib_df_calibrated'][target_col] = step_entry['ensembles_calib']['weighted_average']
    genetic['test_df_calibrated'][target_col] = step_entry['ensembles_test']['weighted_average']

EnsembleList = [genetic]


In [None]:
# Save ensemble predictions
# Storage names are built as <level>_<modelname>_<partition>
name_stem = level + '_' + genetic['modelname']
predstore_calib = name_stem + '_calib'
predstore_test = name_stem + '_test'

# Tag each partition with the run id, then write it to the prediction store,
# replacing any existing entry of the same name
for store_name, frame_key in ((predstore_calib, 'calib_df_calibrated'),
                              (predstore_test, 'test_df_calibrated')):
    predictions = genetic[frame_key]
    predictions.forecasts.set_run(run_id)
    predictions.forecasts.to_store(name=store_name, overwrite=True)

In [None]:
# See which genetic ensembles are in prediction storage
# NOTE(review): ViewsMetadata is not imported by name in the cells shown at the top of
# the notebook; presumably it is provided by one of the star imports -- confirm.
# NOTE(review): with_name('genetic') looks like a filter on the stored name -- confirm
# its exact matching rules against the views_forecasts documentation.
ViewsMetadata().with_name('genetic').fetch()