# ViEWS 3 ensembles: future predictions
UK FCDO Fatalities project, cm level

This notebook produces future predictions for the set of models defined in the list of dictionaries `ModelList`, and combines them using the ensemble weights (`i_weights_df`, read here from `GeneticWeights.csv`). Both the model list and the weights are produced by the notebook fatal_cm_compute_ensemble in this repository. 

The notebook draws on the following .py script files in this repository:

Ensembling.py

FetchData.py

HurdleRegression.py

It also requires the list of models included in the ensemble, in the following file:

EnsembleMetaData_cm_[dev_id].csv (for example EnsembleMetaData_cm_Fatalities001.csv)

In [None]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

# Views 3
from viewser.operations import fetch
from viewser import Queryset, Column
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
import views_dataviz
from views_runs import storage, ModelMetadata
from views_runs.storage import store, retrieve, fetch_metadata
from views_forecasts.extensions import *

# Mapper
import geopandas as gpd

from views_dataviz.map import mapper, utils
from views_dataviz import color
from views_dataviz.map.presets import ViewsMap

import sqlalchemy as sa
#from ingester3.config import source_db_path

# Other packages
import pickle as pkl

#Parallelization
from joblib import Parallel, delayed, cpu_count
from functools import partial
from genetic2 import *

from pathlib import Path

# Predicting fatalities scripts
from HurdleRegression import *
from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated
from FetchData import FetchData, RetrieveFromList

In [None]:
# Common parameters:

# dev_id selects the EnsembleMetaData file and the datasets fetched below;
# run_id is the run under which predictions are read from / written to prediction storage.
dev_id = 'Fatalities001'
run_id = 'Fatalities001' 
# Last month_id with data; forecasts are produced for EndOfHistory + step.
EndOfHistory = 508
# Only used in the progress message of the prediction loop in this notebook.
prod_id = '2022_04_t01'
#run_id = dev_id + '_' + prod_id
RunGeneticAlgo = False
# Level prefix used when building prediction-storage names ('cm').
level = 'cm'
# When True, figures are additionally written below `overleafpath`.
WriteToOverleaf = True

steps = [*range(1, 36+1, 1)] # Which steps to train and predict for

#steps = [1,2,3,4,5,6,7,8,9,10,11,12,15,18,21,24] # Which steps to train and predict for
#fi_steps = [1,3,6,12,36] # Which steps to present feature importances for
#steps = [1,12,24,36]
fi_steps = [1,3,6,12,36]
#steps = [1,6,36]
#fi_steps = [1,6,36]

# Specifying partitions
# Each dictionary gives the (first, last) month_id of the training and prediction periods.
# NOTE(review): the future partition trains through 492 and predicts 493-504 while
# EndOfHistory is 508 - confirm that this range is intended for this production run.

calib_partitioner_dict = {"train":(121,396),"predict":(397,444)}
test_partitioner_dict = {"train":(121,444),"predict":(445,492)}
future_partitioner_dict = {"train":(121,492),"predict":(493,504)}
calib_partitioner =  views_runs.DataPartitioner({"calib":calib_partitioner_dict})
test_partitioner =  views_runs.DataPartitioner({"test":test_partitioner_dict})
future_partitioner =  views_runs.DataPartitioner({"future":future_partitioner_dict})

# Specifying paths - note these have to be set to conform to individual setups!
# These are absolute, user-specific locations; every path built from them later in the
# notebook relies on the trailing slash.

Mydropbox = '/Users/havardhegre/Dropbox (ViEWS)/ViEWS/'
localgitpath = '/Users/havardhegre/views3/'

# overleafpath only exists when WriteToOverleaf is True; all later uses are guarded by the same flag.
if WriteToOverleaf:
    overleafpath = '/Users/havardhegre/Dropbox (ViEWS)/Apps/Overleaf/ViEWS predicting fatalities/'


# Retrieve models and predictions

In [None]:

# Read the ensemble metadata for this development run; each CSV row becomes one
# model dictionary in ModelList.
gitname = 'EnsembleMetaData_cm_' + dev_id + '.csv'
EnsembleMetaData = pd.read_csv(gitname)
ModelList = EnsembleMetaData.to_dict('records')

# List the models with their position in ModelList (enumerate replaces the manual counter).
for i, model in enumerate(ModelList):
    print(i, model['modelname'])

# Retrieve and calibrate predictions and data

In [None]:
## Running and saving David's models
# Import subprocess to run Rscript
import subprocess

# Fetch and save data (can perhaps be simplified?)
qs = Queryset('hh_20_features','country_month')
qs.fetch().to_parquet('markov/tmp.parquet')

# Set commands and arguments. R-scripts located in 'Markov'-folder
command = 'Rscript'
path2script = 'markov/omm_ranger_hh20_fcdo_py.R'
cmd = [command, path2script]
# NOTE(review): the parquet file above is written relative to the working directory,
# while data_path points into localgitpath - confirm both resolve to the same file.
data_path = localgitpath + 'FCDO_predicting_fatalities/markov/' + 'tmp.parquet'
save_path = Mydropbox + 'Projects/PredictingFatalities/Predictions/cm/preds/'
args = [str(EndOfHistory), data_path, save_path]

# Run subprocess. Saves the predictions as csv-files to the save_path location with prefix vmm_[estimator]_hh20_[EndOfHistory]
# check=True raises CalledProcessError when Rscript exits with a non-zero status, so a
# failed run stops here instead of letting later cells read missing or stale csv files.
subprocess.run(cmd + args, check=True)


In [None]:
# Read the Markov-model csv files from dropbox and register each of them in
# prediction storage under the current run.
path = Mydropbox + 'Projects/PredictingFatalities/Predictions/cm/preds/'

# One entry per estimator; the file names follow vmm_[estimator]_hh20_[EndOfHistory].csv
DRList = [
    {
        'modelname': f'fat_hh20_Markov_{estimator}',
        'filename': f'{path}vmm_{estimator}_hh20_{EndOfHistory}.csv',
    }
    for estimator in ('glm', 'rf')
]

for model in DRList:
    stored_modelname = f"{level}_{model['modelname']}_f{EndOfHistory}"
    df_future = pd.read_csv(model['filename'], index_col=['month_id', 'country_id'])
    # Empty dependent variable column for consistency/required by prediction storage function
    df_future['ln_ged_sb_dep'] = np.nan
    df_future.forecasts.set_run(run_id)
    df_future.forecasts.to_store(name=stored_modelname, overwrite=True)

In [None]:
# Fetch the stored predictions for the calibration and test partitions for every
# model, then calibrate them. Both helpers hand back the (updated) list of model
# dictionaries, so the result is chained straight into ModelList.
ModelList = CalibratePredictions(
    RetrieveStoredPredictions(ModelList, steps, EndOfHistory, run_id),
    EndOfHistory,
    steps,
)

In [None]:
# Run querysets and postprocessing (e.g. PCA) to obtain data for future prediction
# Returns as 'Datasets'; a list of dataframes
# The entries are looked up later with RetrieveFromList(Datasets, model['data_train']).
Datasets = FetchData(dev_id)

In [None]:


from views_runs import Storage, StepshiftedModels
from views_partitioning.data_partitioner import DataPartitioner
from viewser import Queryset, Column
from views_runs import operations
from views_runs.run_result import RunResult

from pygam import LogisticGAM, LinearGAM, s, te

# When True, the prediction loop always calls RetrainAndPredict; when False it first
# tries to read the non-calibrated predictions from prediction storage.
RewritePredictions = True # Set this to True to rewrite predictions even if they exist

def RetrainAndPredict(modelname):
    """Retrieve (or retrain) the stepshifted model and compute future point predictions.

    The function works on the module-level ``model`` dictionary that the
    prediction loop is currently iterating over, not on its argument: it reads
    ``model['algorithm']``, ``model['depvar']``, ``model['data_train']``,
    ``model['queryset']`` and ``model['modelname']`` and stores the run result
    in ``model['RunResult_future']``.

    Parameters
    ----------
    modelname : accepted for backward compatibility with existing callers;
        it is not used inside the function.

    Returns
    -------
    The object returned by ``future_point_predict(EndOfHistory, data)`` for
    the retrieved/retrained run.
    """
    # Imported locally because no explicit datetime import is visible at module level.
    from datetime import datetime

    force_retrain = False
    # Predictions for true future
    print('Future', datetime.now())
    # A single storage handle is sufficient; it was previously created twice.
    modelstore = storage.Storage()
    model['RunResult_future'] = RunResult.retrain_or_retrieve(
            retrain            = force_retrain,
            store              = modelstore,
            # The future partition is registered under the name "test" so that the
            # partition_name/timespan_name arguments below can address it.
            partitioner        = DataPartitioner({"test":future_partitioner_dict}),
            stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
            dataset            = RetrieveFromList(Datasets,model['data_train']),
            queryset_name      = model['queryset'],
            partition_name     = "test",
            timespan_name      = "train",
            storage_name       = model['modelname'] + '_future',
            author_name        = "HH",
    )
    predictions_future = model['RunResult_future'].run.future_point_predict(EndOfHistory,model['RunResult_future'].data)
    return predictions_future



# Main production loop: one pass per model dictionary in ModelList.
# `i` is only a counter for the progress messages.
i = 0
print('Computing predictions, production run ' + prod_id + ', development run ' + run_id)
for model in ModelList:

    # For each model the loop
    # (1) checks whether the model was trained outside the main system (Markov models),
    # (2) for all other models either recomputes the predictions (RewritePredictions)
    #     or tries to read them from prediction storage, predicting on a KeyError,
    # (3) stores the non-calibrated predictions, calibrates them with the per-step
    #     calibration GAMs and stores the calibrated predictions.
    # Both stores are written with overwrite=True on every run.
    # To do: set the data_preprocessing to the function in the model dictionary
    
    # Names under which the two versions of the predictions are kept in prediction storage
    model['predstorename_ncal'] = level +  '_' + model['modelname'] + '_noncalibrated' + '_f' + str(EndOfHistory)
    model['predstorename_cal'] = level +  '_' + model['modelname'] + '_calibrated' + '_f' + str(EndOfHistory)

    
    if 'Markov' not in model['modelname']: # Only Markov models are currently exceptions
        print(i, model['modelname'])

        ct = datetime.now()
        print('Trying to retrieve non-calibrated predictions', ct)
        if RewritePredictions:
            # RetrainAndPredict works on the global `model`; the argument is informational only
            model['future_df_noncalibrated'] = RetrainAndPredict(model['predstorename_ncal'])
        else:
            try:
                model['future_df_noncalibrated'] = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstorename_ncal'])
                print('Predictions for ', model['predstorename_ncal'], ', run', run_id, 'exist, retrieving from prediction storage')

            except KeyError:
                # A KeyError from read_store is treated as "not stored yet"
                print(model['predstorename_ncal'], ', run', run_id, 'does not exist, predicting')
                model['future_df_noncalibrated'] = RetrainAndPredict(model['predstorename_ncal'])

        # Calibrating and storing   
        # Storing non-calibrated
        
        model['future_df_noncalibrated'].forecasts.set_run(run_id)
        model['future_df_noncalibrated'].forecasts.to_store(name=model['predstorename_ncal'], overwrite=True)   
        print('Calibrating')
        model['future_df_calibrated'] = model['future_df_noncalibrated'].copy()
        for step in steps:
            thismonth = EndOfHistory + step
            
            # The GAM for this step is looked up at position step-1, i.e. this
            # presumes calibration_gams is ordered by steps 1, 2, 3, ... - verify when
            # running with a non-contiguous `steps` list.
            model['future_df_calibrated'].loc[thismonth,'step_combined'] = pd.DataFrame(model['calibration_gams'][step-1]['calibration_GAM'].predict(model['future_df_noncalibrated'].loc[thismonth])).values
         # Storing calibrated
        model['future_df_calibrated'].forecasts.set_run(run_id)
        model['future_df_calibrated'].forecasts.to_store(name=model['predstorename_cal'], overwrite=True)   
            
    else: # If one of David's Markov models
        print(i, model['modelname'])
            
        # Note: this branch uses the key 'predstorename_noncalibrated' (only for the message
        # below), whereas the branch above uses 'predstorename_ncal'.
        model['predstorename_noncalibrated'] = level +  '_' + model['modelname'] + '_noncalibrated' + '_f' + str(EndOfHistory)
        print(model['predstorename_noncalibrated'], ', run', run_id, 'is being retrieved from dropbox')
        path = Mydropbox + 'Projects/PredictingFatalities/Predictions/cm/preds/'

        # The csv contents are used directly as the calibrated predictions; no GAM
        # calibration is applied in this branch.
        if model['modelname'] == 'fat_hh20_Markov_glm':
            DR_filename = path + 'vmm_glm_hh20_' + str(EndOfHistory) + '.csv'
            model['future_df_calibrated'] = pd.read_csv(DR_filename,index_col=['month_id','country_id'])
        if model['modelname'] == 'fat_hh20_Markov_rf':
            DR_filename = path + 'vmm_rf_hh20_' + str(EndOfHistory) + '.csv'
            model['future_df_calibrated'] = pd.read_csv(DR_filename,index_col=['month_id','country_id'])
            
        # Same value as assigned at the top of the loop body
        model['predstorename_cal'] = level +  '_' + model['modelname'] + '_calibrated' + '_f' + str(EndOfHistory)

        model['future_df_calibrated'].forecasts.set_run(run_id)
        model['future_df_calibrated'].forecasts.to_store(name=model['predstorename_cal'], overwrite=True)   


    i = i + 1

print('All done')
        
        

In [None]:
# Inspection cell: shows the 'data_train' entry of whichever model dictionary the
# name `model` currently refers to (the last one left by a preceding loop).
#df = RetrieveFromList(Datasets,model['data_train'])
model['data_train']

In [None]:
# Ensembles are kept in their own list of dictionaries, separate from ModelList.
# The entry mirrors the keys of a model dictionary; the empty placeholders are
# filled in by later cells.
Ensemble = dict(
    modelname='genetic_ensemble',
    algorithm=[],
    depvar='ln_ged_sb_dep',
    data_train=[],
    Algorithm_text='',
    calibration_gams=[],
    future_df_calibrated=[],
)
EnsembleList = [Ensemble]


In [None]:
# Gather the calibrated 'step_combined' predictions in a single frame with one
# column per model. The first model provides the index; the remaining models
# are added as columns aligned on that index.
ConstituentModels_df = ModelList[0]['future_df_calibrated']['step_combined'].to_frame(
    name=ModelList[0]['modelname']
)
for model in ModelList[1:]:
    ConstituentModels_df[model['modelname']] = model['future_df_calibrated']['step_combined']


In [None]:
# Retrieve genetic algorithm results
# The weights are read from a csv in the working directory; later cells use one
# column per step, named 'step_pred_<step>'.
i_weights_df = pd.read_csv('GeneticWeights.csv')

# Retrieve ensemble predictions for test partition to create categorical predictions

In [None]:
# Read the stored genetic-ensemble predictions for the test partition and set
# infinite values to 0.
stored_modelname_test = f'{level}_ensemble_genetic_test'

ensemble_test_df = pd.DataFrame.forecasts.read_store(stored_modelname_test, run=run_id)
ensemble_test_df = ensemble_test_df.replace([np.inf, -np.inf], 0)

ensemble_test_df.head()

In [None]:
# Dichotomous outcome: 1 where the logged dependent variable is at least log1p(25), else 0
ensemble_test_df['ged_gte_25'] = (ensemble_test_df['ln_ged_sb_dep'] >= np.log1p(25)).astype(int)
# Generate multiclass version for uncertainty estimation
def ged_categorical(x):
    """Map a log1p-transformed fatality count to an ordinal category 0-4.

    The category is the position of the first upper bound (0.5, 10, 100, 1000
    on the count scale) that x lies strictly below; values below none of the
    bounds fall in category 4.
    """
    upper_bounds = (0.5, 10, 100, 1000)
    for category, bound in enumerate(upper_bounds):
        if x < np.log1p(bound):
            return category
    return len(upper_bounds)

# Multiclass outcome: category 0-4 for every observation of the dependent variable
ensemble_test_df['ged_multi'] = ensemble_test_df['ln_ged_sb_dep'].map(ged_categorical)

ensemble_test_df.describe()

In [None]:
# Visual check of the categorisation: logged dependent variable (x) against its category (y)
plt.scatter(ensemble_test_df['ln_ged_sb_dep'],ensemble_test_df['ged_multi'])

In [None]:
# Train model to transform predictions from fatalities to (1) dichotomous and (2) multiclass
# For every step, two logistic regressions are fitted on the test-partition ensemble
# prediction for that step. The fitted classifiers are appended in the order of `steps`.
from sklearn.linear_model import LogisticRegression

dichotomous_classifiers = []
multi_classifiers = []

# The targets do not depend on the step; pass them as 1-D arrays (a column vector
# makes scikit-learn emit a conversion warning).
y_dich = ensemble_test_df['ged_gte_25'].to_numpy()
y_multi = ensemble_test_df['ged_multi'].to_numpy()

for step in steps:
    X = ensemble_test_df[f'step_pred_{step}'].to_numpy().reshape(-1, 1)

    # Dichotomous
    dich_clf = LogisticRegression(random_state=0).fit(X, y_dich)
    dichotomous_classifiers.append(dich_clf)
    p_dich = dich_clf.predict_proba(X)
    # f-string: one column per step (previously every step overwrote the single
    # literal column 'dich_step_{step}_logit').
    ensemble_test_df[f'dich_step_{step}_logit'] = p_dich[:, 1]

    # Multiclass
    multi_clf = LogisticRegression(random_state=0).fit(X, y_multi)
    multi_classifiers.append(multi_clf)
    p_multi = multi_clf.predict_proba(X)
    # predict_proba has one column per class actually present in the data, ordered
    # as multi_clf.classes_; using that order avoids an IndexError (and mislabelled
    # columns) when a category does not occur.
    for column_position, cls in enumerate(multi_clf.classes_):
        ensemble_test_df[f'multi_{cls}_step_{step}_logit'] = p_multi[:, column_position]

ensemble_test_df.describe()

# Calculating and storing ensemble future predictions

In [None]:
# Setting up placeholder dfs for ensemble predictions, copied from the first model
EnsembleList[0]['future_df_calibrated'] = ModelList[0]['future_df_calibrated'].copy() # Copy from baseline
EnsembleList[0]['future_df_dichotomous'] = ModelList[0]['future_df_calibrated'].copy() # Copy from baseline

ConstituentModels_df_w = ConstituentModels_df.copy()

# For every step: weighted sum of the constituent models' predictions for the
# corresponding month, then the probability from the dichotomous classifier.
# The classifiers were appended in the order of `steps`, so they are addressed by
# position rather than by step-1 (which only works for steps 1, 2, 3, ...).
for position, step in enumerate(steps):
    month = EndOfHistory + step
    weightcol = 'step_pred_' + str(step)
    weights = i_weights_df[weightcol].to_numpy()
    ensemble_pred = ConstituentModels_df_w.loc[month].dot(weights).to_numpy()
    # Write only the prediction column; a bare .loc[month] would overwrite every column.
    EnsembleList[0]['future_df_calibrated'].loc[month, 'step_combined'] = ensemble_pred
    x_d = ensemble_pred.reshape(-1, 1)
    pred_step = dichotomous_classifiers[position].predict_proba(x_d)
    # Single .loc call instead of chained df['step_combined'].loc[month] = ..., which
    # may write to a temporary copy and does nothing under pandas copy-on-write.
    EnsembleList[0]['future_df_dichotomous'].loc[month, 'step_combined'] = pred_step[:, 1]

In [None]:
# Write both ensemble frames (fatalities and dichotomous) to prediction storage
# under the current run, replacing any existing entries with the same name.
predstore_future = f"{level}_{EnsembleList[0]['modelname']}_f{EndOfHistory}"
predstore_future_dich = f"{level}_{EnsembleList[0]['modelname']}_dich_f{EndOfHistory}"

for frame_key, store_name in (
    ('future_df_calibrated', predstore_future),
    ('future_df_dichotomous', predstore_future_dich),
):
    EnsembleList[0][frame_key].forecasts.set_run(run_id)
    EnsembleList[0][frame_key].forecasts.to_store(name=store_name, overwrite=True)


In [None]:
# Inspection cell: looks up metadata entries matching the name 'genetic'
# (presumably in prediction storage - confirm), e.g. to check that the ensemble was stored.
ViewsMetadata().with_name('genetic').fetch()

# Mapping future predictions

In [None]:
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import contextily as ctx

from views_dataviz import color
from views_dataviz.map import utils
from views_dataviz.map.presets import ViewsMap

import sqlalchemy as sa
#from ingester3.config import source_db_path
#from ingester3.Country import Country
#from ingester3.extensions import *
#from ingester3.ViewsMonth import ViewsMonth

import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import geopandas as gpd
import pandas as pd
import numpy as np

class Mapper2:
    """
    `Map` takes basic properties and allows the user to consecutively add
    layers to the Map object. This makes it possible to prepare mapping
    "presets" at any level of layeredness that can be built on further.

    Mapper2 allows for the customizable addition of scaling to the map.
    TODO: re-add the code for labels once it can be tested.

    Attributes
    ----------
    width: Integer value for width in inches.
    height: Integer value for height in inches.
    bbox: List for the bbox per [xmin, xmax, ymin, ymax].
    cmap: Optional default colormap used by layers that do not pass their own.
    frame_on: Bool for whether to draw a frame around the map.
    title: Optional default title at matplotlib's default size.
    figure: Optional tuple of (fig, ax) to use if you want to plot into an
        already existing fig and ax, rather than making a new one.
    """

    def __init__(
        self,
        width,
        height,
        bbox=None,
        cmap=None,
        frame_on=True,
        title="",  # Default title without customization. (?)
        figure=None,
    ):
        self.width = width
        self.height = height
        self.bbox = bbox  # xmin, xmax, ymin, ymax
        self.cmap = cmap
        if figure is None:
            self.fig, self.ax = plt.subplots(figsize=(self.width, self.height))
        else:
            self.fig, self.ax = figure
        self.texts = []
        self.ax.set_title(title)

        if frame_on:  # Remove axis ticks only.
            self.ax.tick_params(
                top=False,
                bottom=False,
                left=False,
                right=False,
                labelleft=False,
                labelbottom=False,
            )
        else:
            self.ax.axis("off")

        if bbox is not None:
            self.ax.set_xlim((self.bbox[0], self.bbox[1]))
            self.ax.set_ylim((self.bbox[2], self.bbox[3]))

    @staticmethod
    def _scale_bounds(map_scale):
        """Return (min, max) of a manual scale, or None if no usable scale was given."""
        if map_scale is None or map_scale is False:
            return None
        try:
            values = list(map_scale)
        except TypeError:
            # Not an iterable of scale values: treat as "no manual scale".
            return None
        if not values:
            return None
        return min(values), max(values)

    def add_layer(self, gdf, map_scale=False, map_dictionary=False, cmap=None, inform_colorbar=False, **kwargs):
        """Add a geopandas plot to a new layer.

        Parameters
        ----------
        gdf: Geopandas GeoDataFrame to plot.
        cmap: Optional matplotlib colormap object or string reference
            (e.g. "viridis").
        inform_colorbar: Set or overwrite colorbar with the current layer.
            Not applicable when `color` is supplied in the kwargs.
        map_scale: Iterable of values setting a manual scale for the map; its
            minimum and maximum become the colour range of the layer and of
            the colorbar. If missing (False/None), the range recorded for the
            colorbar (`self.vmin`/`self.vmax`) is used for the colorbar and
            the layer is plotted with the range given in the kwargs, if any.
        map_dictionary: Currently unused; kept for interface compatibility.
        **kwargs: Geopandas `.plot` keyword arguments. Explicit `vmin`/`vmax`
            take precedence over the range derived from `map_scale`.

        Returns
        -------
        self, so that calls can be chained.
        """
        if "color" in kwargs:
            colormap = None
        else:
            colormap = self.cmap if cmap is None else cmap
            if inform_colorbar and "column" in kwargs:
                if hasattr(self, "cax"):
                    self.cax.remove()
                if "vmin" in kwargs:
                    self.vmin = kwargs["vmin"]
                else:
                    self.vmin = gdf[kwargs["column"]].min()
                if "vmax" in kwargs:
                    self.vmax = kwargs["vmax"]
                else:
                    self.vmax = gdf[kwargs["column"]].max()

        scale_bounds = self._scale_bounds(map_scale)

        # Range for the colorbar: the manual scale if given, otherwise the range
        # recorded on the object. This replaces a bare try/except that hid errors.
        if scale_bounds is not None:
            colorbar_range = scale_bounds
        elif hasattr(self, "vmin") and hasattr(self, "vmax"):
            colorbar_range = (self.vmin, self.vmax)
        else:
            colorbar_range = None

        # A colorbar needs both a colormap and a range; single-colour layers
        # (`color` in kwargs) and layers without a known range get none.
        if colormap is not None and colorbar_range is not None:
            self.add_colorbar(colormap, colorbar_range[0], colorbar_range[1])

        plot_kwargs = dict(kwargs)
        if scale_bounds is not None:
            plot_kwargs.setdefault("vmin", scale_bounds[0])
            plot_kwargs.setdefault("vmax", scale_bounds[1])
        self.ax = gdf.plot(ax=self.ax, cmap=colormap, **plot_kwargs)

        return self

    def add_colorbar(
        self,
        cmap,
        vmin,
        vmax,
        location="right",
        size="5%",
        pad=0.1,
        alpha=1,
        labelsize=16,
        tickparams=None,
    ):
        """Add custom colorbar to Map.

        Needed since GeoPandas legend and plot axes do not align, see:
        https://geopandas.readthedocs.io/en/latest/docs/user_guide/mapping.html

        Parameters
        ----------
        cmap: Matplotlib colormap object or string reference (e.g. "viridis").
        vmin: Minimum value of range colorbar.
        vmax: Maximum value of range colorbar.
        location: String for location of colorbar: "top", "bottom", "left"
            or "right".
        size: Size in either string percentage or number of pixels.
        pad: Float for padding between the plot's frame and colorbar.
        alpha: Float for alpha to apply to colorbar.
        labelsize: Integer value for the text size of the ticklabels.
        tickparams: Dictionary containing value-label pairs. For example:
            {0.05: "5%", 0.1: "10%"}

        Returns
        -------
        self; the new colorbar axis and colorbar are kept as `self.cax` and
        `self.cbar`.
        """
        norm = plt.Normalize(vmin, vmax)
        if isinstance(cmap, str):
            cmap = plt.get_cmap(cmap)
        cmap = color.force_alpha_colormap(cmap=cmap, alpha=alpha)
        scalar_to_rgba = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
        divider = make_axes_locatable(self.ax)
        self.cax = divider.append_axes(location, size, pad)
        self.cax.tick_params(labelsize=labelsize)
        tickvalues = (
            list(tickparams.keys()) if tickparams is not None else None
        )
        self.cbar = plt.colorbar(
            scalar_to_rgba, cax=self.cax, ticks=tickvalues
        )
        if tickparams is not None:
            self.cbar.set_ticklabels(list(tickparams.values()))
        return self

    def save(
        self, path, dpi=200, **kwargs
    ):  # Just some defaults to reduce work.
        """Save Map figure to file and close the figure.

        Parameters
        ----------
        path: String path, e.g. "./example.png".
        dpi: Integer dots per inch. Increase for higher resolution figures.
        **kwargs: Matplotlib `savefig` keyword arguments.
        """
        self.fig.savefig(path, dpi=dpi, bbox_inches="tight", **kwargs)
        plt.close(self.fig)
        
def vid2date(i):
    """Convert a ViEWS month_id to a 'YYYY/M' label.

    month_ids are 1-based with month 1 = January 1980, so the id is shifted by
    one before splitting it into year and month. Without the shift every
    December came out as month 0 of the following year (e.g. 492 -> '2021/0'
    instead of '2020/12').

    Parameters
    ----------
    i : integer month_id.

    Returns
    -------
    String of the form 'year/month' with the month not zero-padded.
    """
    months_since_start = i - 1
    year = str(1980 + months_since_start // 12)
    month = str(months_since_start % 12 + 1)
    return year + '/' + month
        
#def vid2date(i):
#    year=str(ViewsMonth(i).year)
#    month=str(ViewsMonth(i).month)
#    return year+'/'+month

#note the zip function occured earlier
# Colour-scale tick positions (log1p of the fatality counts) and their labels.
# The small scale is the standard scale cut off at 1000.
standard_scale_labels = ['0', '3', '10', '30', '100', '300', '1000', '3000', '10000']
standard_scale = [np.log1p(count) for count in (0, 3, 10, 30, 100, 300, 1000, 3000, 10000)]

small_scale_labels = ['0', '3', '10', '30', '100', '300', '1000']
small_scale = [np.log1p(count) for count in (0, 3, 10, 30, 100, 300, 1000)]

# Same number of ticks as the small scale, but without text
small_scale_nolabels = [''] * 7

In [None]:
# Prepare the gdf
# Geometries are read from a parquet file relative to the working directory;
# plotting code works on copies of gdf_base.
gdf_base = gpd.read_parquet('./geometry/cm_geometry.parquet')
gdf = gdf_base.copy()

In [None]:
# Future prediction maps, predictions, rolling
# Output folder for the maps and the steps (months ahead) that get a map
path = Mydropbox + 'Projects/PredictingFatalities/maps/cm_future/'
stepstoplot = [3, 5, 8, 12, 24, 36]

# Attach the geometries (indexed by country_id) to the ensemble predictions and
# turn the result into a GeoDataFrame with 'geom' as geometry column.
gdf2 = gdf_base.copy()
df = EnsembleList[0]['future_df_calibrated'].copy().join(gdf2.set_index("country_id"))
gdf3 = gpd.GeoDataFrame(df, geometry="geom")

In [None]:
# One map per selected step, on the standard scale. Each map is written to the
# dropbox folder and, when WriteToOverleaf is set, also to the Overleaf folder.
for step in stepstoplot:
    month = step + EndOfHistory
    gdf = gdf3.loc[month]
    map_title = (
        'Ensemble predictions as of ' + vid2date(EndOfHistory+step)
        + ', ' + str(step) + ' months after last month with data'
    )
    m = Mapper2(
        width=10,
        height=10,
        frame_on=True,
        title=map_title,
        bbox=[-18.5, 64.0, -35.5, 43.0],
    )
    # add_layer returns the mapper itself, so `m` is unchanged by the call
    m = m.add_layer(
        gdf=gdf,
        map_scale=standard_scale,
        cmap="rainbow",
        edgecolor="black",
        linewidth=0.5,
        column='step_combined',
        inform_colorbar=True,
    )
    m.cbar.set_ticks(standard_scale)
    m.cbar.set_ticklabels(standard_scale_labels)
    map_filename = f'PredictionMap_cm_ensemble_standard_scale_r{EndOfHistory}_m{month}.png'
    if WriteToOverleaf:
        m.save(f'{overleafpath}Figures/Future/{map_filename}')
    m.save(f'{path}{map_filename}')

# Line graphs for individual countries

In [None]:
def CumulativeFatalities(df,steps):
    ''' Calculates the sum of (real-space) fatalities in the step_combined column.

    The first ``steps + 1`` rows of ``df['step_combined']`` (by position) are
    transformed back from log1p space with expm1, rounded to whole numbers and
    summed. The input frame is left untouched: no helper column is written to
    it any more, which also avoids assigning to a slice of another frame.

    NOTE(review): the slice covers steps + 1 rows. If the first row of df is
    the first forecast month, this sums one month more than `steps` - confirm
    that this is intended.

    Parameters
    ----------
    df : DataFrame with a 'step_combined' column of log1p-transformed values.
    steps : integer; rows at positions 0..steps are included.

    Returns
    -------
    The summed, rounded real-space values as a float.
    '''
    logged = df['step_combined'].iloc[0:steps+1]
    return np.rint(np.expm1(logged)).sum()
    

In [None]:
from matplotlib import cm

# Figure size and output folder for the country line graphs
plt.rcParams["figure.figsize"] = (6, 6)
path = Mydropbox + 'Projects/PredictingFatalities/PredictionPlots/cm_future/'

# y axis: tick labels are fatality counts, tick positions their log1p values
log_scale_naming = ['0','1','3','10','30','100','300','1000','3000']
log_scale_value = np.array([np.log1p(int(count)) for count in log_scale_naming])

# x axis: every third month of the forecasting window, labelled as year/month
month_value = list(range(EndOfHistory+1, EndOfHistory+36, 3))
month_name = [vid2date(month_id) for month_id in month_value]

first_month = EndOfHistory

# Countries to produce line graphs and totals for.
# Each entry is (country name, country_id, third value). The name is used in titles and
# file names and the country_id to select the country's rows from the prediction frame;
# the third value is not used in the visible cells (presumably an axis/scale limit - confirm).
CountryList = [
    ('Algeria',67,5000),
    ('Angola',165,500),
    ('Bahrain',127,500),
    ('Benin',74,500),
    ('Botswana',154,500),
    ('Burkina Faso',47,2000),
    ('Burundi',155,2000),
    ('Cameroon',69,2000),
    ('Central African Republic',70,2000),
    ('Chad',214,2000),
    ('Congo',166,2000),
    ('Cote d\'Ivoire',41,2000),
    ('Djibouti',55,500),
    ('DRCongo',167,20000),
    ('Egypt',222,5000),
    ('Equatorial Guinea',76,2000),
    ('Eritrea',56,500),
    ('Eswatini',164,500),
    ('Ethiopia',57,2000),
    ('Gabon',169,500),
    ('The Gambia',54,500),
    ('Ghana',42,500),
    ('Guinea',48,500),
    ('Guinea-Bissau',49,500),
    ('Iran',128,2000),
    ('Iraq',60,2000),
    ('Israel',218,2000),
    ('Jordan',62,2000),
    ('Kenya',237,2000),
    ('Lebanon',94,2000),
    ('Lesotho',160,2000),
    ('Liberia',43,2000),
    ('Libya',213,5000),
    ('Madagascar',172,500),
    ('Malawi',161,2000),
    ('Mali',50,20000),
    ('Mauritania',244,500),
    ('Morocco',243,500),
    ('Mozambique',162,2000),
    ('Namibia',170,500),
    ('Niger',78,2000),
    ('Nigeria',79,20000),
    ('Oman',119,2000),
    ('Qatar',130,2000),
    ('Rwanda',156,2000),
    ('Saudi Arabia',131,500),
    ('Senegal',52,500),
    ('Sierra Leone',53,500),
    ('Somalia',120,20000),
    ('South Africa',163,2000),
    ('South Sudan',246,5000),
    ('Sudan',245,2000),
    ('Syria',220,50000),
    ('Tanzania',242,500),
    ('Togo',81,500),
    ('Turkey',96,2000),
    ('Uganda',235,2000),
    ('United Arab Emirates',132,500),
    ('Yemen',124,20000),
    ('Zambia',157,2000),
    ('Zimbabwe',158,5000),
]


df = EnsembleList[0]['future_df_calibrated'].copy()
# Accumulators for the cumulative totals, one list per horizon plus a combined list
totals36, totals12, totals6, totals3, totals = [], [], [], [], []

model = EnsembleList[-1]

print(model['modelname'])
# Per country: cumulative totals for the 3, 6, 12 and 36 month horizons and a
# line graph of the logged predictions. Graphs for a few selected countries are
# also written to the Overleaf folder.
overleaf_countries = ('Ethiopia', 'Nigeria', 'DRCongo', 'Somalia', 'Yemen', 'Syria')
for cnt in CountryList:
    country_name, country_id = cnt[0], cnt[1]
    plt.clf()
    sc_df = EnsembleList[0]['future_df_calibrated'].xs(country_id, level=1)

    # Each horizon is computed once and reused below
    cumulative = {horizon: CumulativeFatalities(sc_df, horizon) for horizon in (3, 6, 12, 36)}
    print(
        country_name,
        '3 months: ', "{:.0f}".format(cumulative[3]),
        '12 months: ', "{:.0f}".format(cumulative[12]),
        ', 36 months:{:.0f}'.format(cumulative[36]),
    )
    for horizon, horizon_totals in ((36, totals36), (12, totals12), (6, totals6), (3, totals3)):
        horizon_totals.append({'Country': country_name, 'Fatalities': cumulative[horizon]})
    t = [{'Country': country_name, 'Fatalities': cumulative[horizon]} for horizon in (36, 12, 6, 3)]
    totals.append(t)

    months = sc_df.index.to_series()
    sc_df_exp = sc_df.copy()
    plt.plot(months, 'step_combined', data=sc_df)
    plt.suptitle('Forecasted number of fatalities, ' + country_name, fontsize=16)
    plt.title(
        'Total for 36-month period from ' + vid2date(EndOfHistory+1) + ' to ' + vid2date(EndOfHistory+36)
        + ': ' + "{:.0f}".format(cumulative[36]),
        fontsize=12,
    )
    plt.ylabel('Number of fatalities')
    plt.yticks(log_scale_value, log_scale_naming, rotation=30)
    plt.xticks(month_value, month_name, rotation=30)
    plt.grid(axis='y')
    plt.ylim([0,np.log1p(3000)])

    graph_name = 'LineGraph_' + model['modelname'] + '_' + country_name + '_r' + str(EndOfHistory) + '.png'
    filename = path + graph_name
    plt.savefig(filename, dpi=300)
    if country_name in overleaf_countries and WriteToOverleaf:
        filename = overleafpath + 'Figures/Future/' + graph_name
        plt.savefig(filename, dpi=300)
        
# Collecting five top countries in one graph
plt.clf()
top_countries = ('Nigeria', 'DRCongo', 'Somalia', 'Yemen', 'Syria')
for cnt in CountryList:
    if cnt[0] not in top_countries:
        continue
    sc_df = EnsembleList[0]['future_df_calibrated'].xs(cnt[1],level=1)
    months = sc_df.index.to_series()
    sc_df_exp = sc_df.copy()
    plt.plot(months, 'step_combined', data=sc_df, label=cnt[0])

# Same axes set-up as for the single-country graphs, plus a legend
plt.suptitle('Forecasted number of fatalities, five most violent countries', fontsize=16)
plt.ylabel('Number of fatalities')
plt.yticks(log_scale_value, log_scale_naming, rotation=30)
plt.xticks(month_value, month_name, rotation=30)
plt.grid(axis='y')
plt.ylim([0,np.log1p(3000)])
plt.legend()

top5_name = 'LineGraph_' + model['modelname'] + '_' + 'Top5' + '_r' + str(EndOfHistory) + '.png'
filename = path + top5_name
plt.savefig(filename, dpi=300)
if WriteToOverleaf:
    filename = overleafpath + 'Figures/Future/' + top5_name
    plt.savefig(filename, dpi=300)

# One frame per horizon with the cumulative totals as integers, plus a frame
# holding the combined per-country lists.
totals36_df = pd.DataFrame(totals36).astype({'Fatalities': int})
totals12_df = pd.DataFrame(totals12).astype({'Fatalities': int})
totals6_df = pd.DataFrame(totals6).astype({'Fatalities': int})
totals3_df = pd.DataFrame(totals3).astype({'Fatalities': int})
totals_df = pd.DataFrame(totals)

In [None]:
# Pie charts: share of total predicted fatalities per country, 3/6/12/36 months ahead
# Only the first threshold is applied (to every horizon); the second entry is not
# used anywhere in this cell.
MinimumFatalities = [100,300]

# Keep only countries with at least MinimumFatalities[0] predicted fatalities.
totals3_subset = totals3_df.loc[(totals3_df['Fatalities'] >= MinimumFatalities[0])]
labels3 = totals3_subset['Country']
sizes3 = totals3_subset['Fatalities']

totals6_subset = totals6_df.loc[(totals6_df['Fatalities'] >= MinimumFatalities[0])]
labels6 = totals6_subset['Country']
sizes6 = totals6_subset['Fatalities']

totals12_subset = totals12_df.loc[(totals12_df['Fatalities'] >= MinimumFatalities[0])]
labels12 = totals12_subset['Country']
sizes12 = totals12_subset['Fatalities']

totals36_subset = totals36_df.loc[(totals36_df['Fatalities'] >= MinimumFatalities[0])]
labels36 = totals36_subset['Country']
sizes36 = totals36_subset['Fatalities']


def _save_totals_pies(left, right, file_stem):
    """Draw two pie charts side by side, title them and save the figure.

    left, right: (months_ahead, labels, sizes) for the left and right panel.
    file_stem:   file-name prefix, e.g. 'TotalsPie_12_36_'.

    Reads path, overleafpath, WriteToOverleaf, model and EndOfHistory from the
    notebook namespace. Returns the figure.
    """
    fig, (ax_left, ax_right) = plt.subplots(1,2,figsize=(15,15))

    # NOTE(review): only the left pie is given radius=5000; kept as in the original.
    ax_left.pie(left[2], labels=left[1], autopct='%1.1f%%', shadow=False, startangle=110, radius=5000)
    ax_left.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    ax_left.set_title('Next ' + str(left[0]) + ' months')

    ax_right.pie(right[2], labels=right[1], autopct='%1.1f%%', shadow=False, startangle=110)
    ax_right.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    ax_right.set_title('Next ' + str(right[0]) + ' months')

    # Figure-level title that states the threshold actually applied (filter is >=).
    fig.suptitle('Distribution of predicted fatalities, countries with at least '
                 + str(MinimumFatalities[0]) + ' deaths')

    png_name = file_stem + model['modelname'] + '_' + str(EndOfHistory) + '.png'
    fig.savefig(path + png_name, dpi=300)
    if WriteToOverleaf:
        fig.savefig(overleafpath + 'Figures/Future/' + png_name, dpi=300)
    return fig


# Figure for 12 and 36 months into the future
fig1 = _save_totals_pies((12, labels12, sizes12), (36, labels36, sizes36), 'TotalsPie_12_36_')

# Figure for 3 and 6 months
fig2 = _save_totals_pies((3, labels3, sizes3), (6, labels6, sizes6), 'TotalsPie_3_6_')

In [None]:
# Horizontal bar charts of total predicted fatalities per country, next 3/6/12/36 months
labels3, sizes3 = totals3_df['Country'], totals3_df['Fatalities']
labels6, sizes6 = totals6_df['Country'], totals6_df['Fatalities']
labels12, sizes12 = totals12_df['Country'], totals12_df['Fatalities']
labels36, sizes36 = totals36_df['Country'], totals36_df['Fatalities']

# One two-panel figure per entry: first 12 and 36 months, then 3 and 6 months.
bar_figures = (
    ('TotalsBar_12_36_', ((12, labels12, sizes12), (36, labels36, sizes36))),
    ('TotalsBar_3_6_', ((3, labels3, sizes3), (6, labels6, sizes6))),
)

for file_stem, horizons in bar_figures:
    fig1,(ax1,ax2) = plt.subplots(1,2,figsize=(15,15))

    for panel, (n_months, country_labels, fatalities) in zip((ax1, ax2), horizons):
        panel.barh(width=fatalities, y=country_labels)
        panel.invert_yaxis()  # labels read top-to-bottom
        panel.grid(axis = 'x')
        panel.set_title('Total predicted fatalities, next ' + str(n_months) + ' months')

    # Saved locally, and to the Overleaf folder when requested.
    filename = path + file_stem + model['modelname'] + '_' + str(EndOfHistory) + '.png'
    plt.savefig(filename, dpi=300)
    if WriteToOverleaf:
        filename = overleafpath + 'Figures/Future/' + file_stem + model['modelname'] + '_' + str(EndOfHistory) + '.png'
        plt.savefig(filename, dpi=300)

## Retrain the surrogate models

In [None]:
# Ad-hoc inspection: display the 'vdem_v2x_libdem' values stored under index label 544
# in the data frame of the dataset at position 10 of Datasets.
# NOTE(review): 544 is presumably a month id and 10 the dataset the surrogate models
# are trained on -- confirm, or drop this cell from the final notebook.
Datasets[10]['df']['vdem_v2x_libdem'].loc[544]

In [None]:
from cm_surrogatemodels import TrainSurrogateModels

# Train surrogate models for every step in `steps`.
# (The hard-coded list [1,3,6,36] that preceded this assignment was dead code:
# it was overwritten on the very next line.)
SurrogateModelSteps = steps
# NOTE(review): second element of the 'train' entry of the test partition --
# presumably the last training month; confirm against test_partitioner_dict.
EndOfHistory_test = test_partitioner_dict['train'][1]
Plotpath = Mydropbox + 'Projects/PredictingFatalities/SurrogateModels/'

# Result is a list of dicts; later cells read the keys 'Step', 'Modelname',
# 'Shortname', 'Columns', 'Predcolname' and 'GAM' from each entry.
SurrogateModelList = TrainSurrogateModels(data_df = Datasets[10]['df'],
                                          Ensemble_df = ensemble_test_df,
                                          EndOfHistory = EndOfHistory_test,
                                          SurrogateModelSteps = SurrogateModelSteps,
                                          NumberOfMonths = 48,
                                          Plotpath = Plotpath)

In [None]:
# Predictor values at the last month with data; the surrogate models are applied to these.
predictors_df = Datasets[10]['df'].loc[EndOfHistory]


def _surrogate_colname(item):
    """Return the model name without its first word (which is a step number)."""
    return item['Modelname'][item['Modelname'].index(' ') + 1:]


# Start from the calibrated ensemble forecasts and add one column per surrogate model.
future_df_surrogates = EnsembleList[0]['future_df_calibrated'].copy()
EnsembleList[0]['future_df_surrogates'] = future_df_surrogates

# Initialize dataframe to hold surrogate model predictions:
for item in SurrogateModelList:
    if item['Step'] == 1:
        colname = _surrogate_colname(item)
        future_df_surrogates[colname] = np.nan

# Compute predictions for each step
for step in steps:
    month = EndOfHistory + step
    for item in SurrogateModelList:
        if item['Step'] == step:
            colname = _surrogate_colname(item)
            # Single .loc assignment on the frame itself. The previous chained form
            # (frame[colname].loc[month] = ...) writes to an intermediate Series and
            # is not guaranteed to reach the frame (it never does under Copy-on-Write).
            # Values are assigned positionally to the rows of `month`.
            future_df_surrogates.loc[month, colname] = item['GAM'].predict(predictors_df[item['Columns']])

# Storing the surrogate model future predictions
# (one stored series per model; the step-36 entries are used to enumerate the models)
for item in SurrogateModelList:
    if item['Step'] == 36:
        colname = _surrogate_colname(item)
        predstore_future = level +  '_surrogate_' + item['Shortname'] + '_f' + str(EndOfHistory)
        print('Storing surrogate model predictions for model',colname, 'as:',predstore_future)
        predictions_to_store = pd.DataFrame(future_df_surrogates[colname])
        predictions_to_store.forecasts.set_run(run_id)
        predictions_to_store.forecasts.to_store(name=predstore_future, overwrite = True)



In [None]:
# Mapping: one map of surrogate-model predictions per model and selected step

# Predictor values at the last month with data, and three months earlier.
predictors_df = Datasets[10]['df'].loc[EndOfHistory]
predictors_df_3m = Datasets[10]['df'].loc[EndOfHistory-3]

# NOTE(review): `path` is rebound here and keeps this value in later cells, so the
# change maps saved further down also land in this folder -- confirm that is intended.
path = Mydropbox + 'Projects/PredictingFatalities/maps/cm_future/Surrogate/'
surrogate_scale=[np.log1p(0),np.log1p(3),np.log1p(10), np.log1p(30), np.log1p(100), np.log1p(300)]

# Tick labels are deliberately blank strings, one per tick in surrogate_scale.
surrogate_scale_labels = ['', '','', '', '', '']

MapSteps = [1,3,6,12,36]
# Country geometries keyed by country_id; built once instead of once per map.
country_shapes = gdf_base.set_index("country_id")

# The loop variable is named `surrogate` (not `model`) so that the global `model`
# used by the plotting cells for file names is not overwritten.
for surrogate in SurrogateModelList:
    if surrogate['Step'] not in MapSteps:
        continue
    print(surrogate['Modelname'], surrogate['Columns'])

    Predcolname = surrogate['Predcolname']
    step = surrogate['Step']
    TargetMonth = EndOfHistory+step

    # Explicit copy: a prediction column is added to this frame below.
    df = predictors_df[surrogate['Columns']].copy()
    df[Predcolname] = surrogate['GAM'].predict(predictors_df[surrogate['Columns']])
    df = df.join(country_shapes)
    gdf3 = gpd.GeoDataFrame(df, geometry="geom")

    m=Mapper2(
    width=10,
    height=10,
    frame_on=True,
    title='Surrogate model ' + surrogate['Modelname'] + ' predictions as of ' + vid2date(TargetMonth) + ', ' + str(step) + ' months after last month with data',
    bbox=[-18.5, 64.0, -35.5, 43.0],
    ).add_layer(
    gdf=gdf3,
    map_scale=surrogate_scale,
    cmap="rainbow",
    edgecolor="black",
    linewidth=0.5,
    column=Predcolname,
    inform_colorbar=True
    )
    m.cbar.set_ticks(surrogate_scale)
    m.cbar.set_ticklabels(surrogate_scale_labels)

    m.save(f'{path}cm_surrogate_{Predcolname}_small_scale_{EndOfHistory}_{TargetMonth}.png')
    if WriteToOverleaf:
        m.save(f'{overleafpath}Figures/Future/cm_surrogate_{Predcolname}_small_scale_{EndOfHistory}_{TargetMonth}.png')


# Changes in forecasts (3, 6, 12 and 36 months ahead) since the forecasts issued three months ago, and since last actual observation

In [None]:
# Reading in data for mapping
# Predictions now and then ("then" = the forecasts issued three months earlier)
predstore_then = level +  '_' + EnsembleList[0]['modelname'] + '_f' + str(EndOfHistory-3)

df_now = EnsembleList[0]['future_df_calibrated'].copy()
try:
    df_then = pd.DataFrame.forecasts.read_store(run=run_id, name=predstore_then)
except Exception as err:
    # Catch Exception rather than everything, so an interrupt still stops the cell.
    # df_then is reset to None so that later cells cannot silently pick up a
    # df_then left over from an earlier execution of this notebook.
    df_then = None
    print('Trouble reading forecasts issued three months ago:', predstore_then, '-', repr(err))

# Actuals
qs = (Queryset("hh_fatalities_ged_ln_ultrashort", "country_month"))
df_lastobserved = qs.fetch().astype(float)

In [None]:
# Compute log of mean non-logged fatalities, past six months
# Window length in months; it drives both the number of lags summed and the
# divisor of the mean, so the two cannot drift apart.
n_observed_months = 6

# Explicit copy: new columns are added to this frame below.
df_observed = df_lastobserved.loc[EndOfHistory].copy()
df_observed['ged_sb_0'] = np.expm1(df_observed['ln_ged_sb'])
df_observed['ged_sum'] = df_observed['ged_sb_0']
for t in range(1, n_observed_months):
    # Fatalities t months before the last month with data, back-transformed from log1p
    colname = 'ged_sb_' + str(t)
    df_observed[colname] = np.expm1(df_lastobserved.loc[EndOfHistory-t]['ln_ged_sb'])
    df_observed['ged_sum'] = df_observed['ged_sum'] + df_observed[colname]
df_observed['ln_ged_sum'] = np.log1p(df_observed['ged_sum']/n_observed_months)

In [None]:
# One entry per forecast horizon: the current forecast for EndOfHistory + step and
# the forecast issued three months earlier for the month `step` ahead of that issue date.
StepsForward = [
    {
        'Step': horizon,
        'df_now': df_now.loc[EndOfHistory + horizon],
        'df_then': df_then.loc[EndOfHistory - 3 + horizon],
    }
    for horizon in (3, 6, 12, 36)
]

# NOTE(review): source_db_path is not defined in this cell, and its import is
# commented out in the notebook's import cell -- make sure it is in scope.
engine = sa.create_engine(source_db_path)

# Country geometries are the same for every step: query them once, not once per step.
country_gdf = gpd.GeoDataFrame.from_postgis(
    "SELECT id as country_id, in_africa, in_me, geom FROM prod.country",
    engine,
    geom_col='geom'
).to_crs(4326).set_index("country_id")

for s in StepsForward:
    # Non-inplace renames: the frames are slices of df_now / df_then.
    s['df_now'] = s['df_now'].rename(columns={'step_combined':'Now'})
    s['df_then'] = s['df_then'].rename(columns={'step_combined':'Then'})
    s['df'] = pd.concat([s['df_now'],s['df_then'],df_observed['ln_ged_sum']],axis=1)
    s['df']['Change_in_prediction'] = s['df']['Now']-s['df']['Then']
    s['df']['Change_since_last_observed'] = s['df']['Now']-s['df']['ln_ged_sum']

    # Surrogate model change: prediction from current predictors minus prediction
    # from the predictors three months earlier.
    for sm in SurrogateModelList:
        if sm['Step'] == s['Step']:
            # Explicit copies: a prediction column is added to each frame.
            s['sdf'] = predictors_df[sm['Columns']].copy()
            s['sdf'][sm['Predcolname']] = sm['GAM'].predict(predictors_df[sm['Columns']])
            s['sdf_3m'] = predictors_df_3m[sm['Columns']].copy()
            s['sdf_3m'][sm['Predcolname']] = sm['GAM'].predict(predictors_df_3m[sm['Columns']])
            print(sm['Step'],sm['Predcolname'])
            # NOTE(review): this drops exactly the last two characters of Predcolname;
            # if that suffix encodes the step, two-digit steps (12, 36) would yield
            # different column names than one-digit steps -- confirm.
            dfcolname = sm['Predcolname'][:-2] + '_ch3m'
            s['df'][dfcolname] = s['sdf'][sm['Predcolname']] - s['sdf_3m'][sm['Predcolname']]

    # Attach geometries so the changes can be mapped.
    s['gdf_t'] = s['df'].join(country_gdf)
    s['gdf'] = gpd.GeoDataFrame(s['gdf_t'], geometry="geom")


StepsForward[1]['gdf'].describe()

In [None]:
# Number of months named in the colour-bar label
delta = 3

# Colour-bar ticks are labelled as percentage changes and positioned at
# log((100 + pct) / 100).
# NOTE(review): presumably because the plotted columns are differences of
# log-scale values -- confirm.
tickvalues = np.array([-83,-80,-50,-20,0,20,50,100,200,500])
ticklabels=[str(tv) for tv in tickvalues]
ticklabels[0] = ""  # first tick (-83) is left unlabelled
tickvalues = np.log((100+tickvalues)/100)

bbox="africa_middle_east"
cmap='bwr'#'rainbow'
ColumnsToPlot = ['Change_in_prediction',
                 'Change_since_last_observed',
                 's_pred_mCH_ch3m',
                 's_pred_mNCH_ch3m',
                 's_pred_mDem_ch3m',
                 's_pred_mIMR_ch3m',
                # 's_pred_mTopics10_ch3m',
                ]

# Singular/plural for the label; does not change inside the loops.
if abs(delta)==1:
    mnth='month'
else:
    mnth='months'

# One map per horizon and per column; saved under `path`, which is set in an earlier cell.
for s in StepsForward:
    print('Step:',s['Step'])
    for column in ColumnsToPlot:
        plot = ViewsMap(
            width=10,
            label=f"{column}, s= {s['Step']}",
            title="",
            scale=None,
            bbox=bbox
        ).add_layer(
            s['gdf'],
            edgecolor="black",
            linewidth=0.2,
            column=column,
            inform_colorbar=True,
            cmap=cmap,
            vmin=tickvalues[0],vmax=tickvalues[-1]
        )

        # Overlay country borders on the same axes.
        ax=plot.ax
        s['gdf'].plot(ax=ax,edgecolor='black',linewidth=0.2,facecolor='None')

        plot.cbar.set_ticks(tickvalues)
        plot.cbar.set_ticklabels(ticklabels)
        plot.cbar.set_label(f'Percent change in {column} over past '+str(delta)+' '+mnth)
        plot.save(path+column+str(s['Step'])+'_r' + str(EndOfHistory) +'.png')
        if WriteToOverleaf:
            plot.save(overleafpath+'Figures/Future/'+column+str(s['Step'])+'_r' + str(EndOfHistory) +'.png')

# Uncertainty of predictions

In [None]:
# Train model to transform predictions from  fatalities to multiclass probabilities
# For every step, a multinomial logistic regression is fitted on the test-partition
# ensemble predictions against the observed class, then applied to the future
# forecast for that step. The resulting class probabilities are stored in the
# columns 'multi_<class>_logit'.
from sklearn.linear_model import LogisticRegression
multi_classes = [0,1,2,3,4]
multi_classifiers = []
df_future = EnsembleList[0]['future_df_calibrated'].copy()
for cls in multi_classes:
    df_future[f'multi_{cls}_logit'] = np.nan

# The observed classes are the same for every step: build the target once, as a
# 1-D array (scikit-learn expects 1-D y and warns on a column vector).
y_multi = np.asarray(ensemble_test_df['ged_multi']).ravel()

for step in steps:
    Month = EndOfHistory + step
    X = np.asarray(ensemble_test_df[f'step_pred_{step}']).reshape(-1,1)
    # Multiclass
    multi_clf = LogisticRegression(random_state=0).fit(X, y_multi)
    multi_classifiers.append(multi_clf)
    X_future = np.asarray(df_future['step_combined'].loc[Month]).reshape(-1,1)
    p_multi = multi_clf.predict_proba(X_future)
    # predict_proba columns follow multi_clf.classes_; look each class up there
    # instead of assuming that column i is class i. A class that never occurs in
    # the training data keeps NaN.
    # NOTE(review): assumes the labels in 'ged_multi' are numeric -- confirm.
    proba_column = {int(label): pos for pos, label in enumerate(multi_clf.classes_)}
    for cls in multi_classes:
        if cls in proba_column:
            # Single .loc assignment on the frame; the chained form
            # (frame[col].loc[Month] = ...) is not guaranteed to write through.
            df_future.loc[Month, f'multi_{cls}_logit'] = p_multi[:, proba_column[cls]]


df_future.describe()

In [None]:
# Write the future forecasts, including the class-probability columns, to a CSV
# file in the current working directory.
categorical_csv = 'Categorical_probabilities.csv'
df_future.to_csv(categorical_csv)