
# ViEWS 3 constituent models 
## ViEWS production system, cm level


This notebook trains a set of regression models for use in the monthly updated ViEWS Predicting Fatalities ensemble.

The notebook does the following: 
1. Retrieves data through querysets and stores in DataSets, a list of dictionaries
2. Specifies the metadata of a number of models, stores in ModelList, a list of dictionaries
3. Trains the models in ModelList, stores the trained objects in model storage and prediction storage
4. Saves part of ModelList as csv and the rest as pickles


## Importing modules

In [1]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier

from lightgbm import LGBMClassifier, LGBMRegressor

# Views 3
from viewser.operations import fetch
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
import views_dataviz
from views_runs import storage
from views_runs.storage import store, retrieve, fetch_metadata

from views_forecasts.extensions import *

# Other packages
import pickle as pkl

# Packages from Predicting Fatalities repository

#from HurdleRegression import * # Built on script from Geoff Ruddock: https://geoffruddock.com/building-a-hurdle-regression-estimator-in-scikit-learn/
#from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated
from FetchData import FetchData, RetrieveFromList
from ViewsEstimators import *


  from pandas import MultiIndex, Int64Index


## Common parameters

In [2]:
#!conda list | grep views

In [3]:
# To do:
# find out why and where missingness occurs

In [4]:
# Common parameters shared by the cells below.
dev_id = 'Fatalities002'  # passed to FetchData and embedded in the saved csv names
run_id = 'Fatalities002'  # run under which predictions are stored and read back

# Generating a new run if necessary (kept disabled)

#try:
#    ViewsMetadata().new_run(name=run_id,description='Developing the fatalities model for FCDO',min_month=1,max_month=999)
#except KeyError:
#    if 'devel' not in run_id:
#        warnings.warn('You are overwriting a production system')

# When True, the queryset cell imports cm_querysets.
RerunQuerysets = True

# Month id handed to future_point_predict and embedded in the future prediction-store name.
FutureStart = 508

# Which steps to train and predict for: 1, 2, ..., 36.
steps = list(range(1, 37))
# Which steps to present feature importances for.
fi_steps = [1, 3, 6, 12, 36]
#steps = [1,3,6,12,36]
#fi_steps = [1,3,6,12,36]

# Specifying partitions.
# Each entry pairs a "train" and a "predict" timespan
# (presumably (first, last) month_id, inclusive - TODO confirm against views_partitioning).
calib_partitioner_dict = {
    "train":   (121, 396),
    "predict": (397, 444),
}
test_partitioner_dict = {
    "train":   (121, 444),
    "predict": (445, 492),
}
future_partitioner_dict = {
    "train":   (121, 492),
    "predict": (493, 504),
}

calib_partitioner = views_runs.DataPartitioner({"calib": calib_partitioner_dict})
test_partitioner = views_runs.DataPartitioner({"test": test_partitioner_dict})
future_partitioner = views_runs.DataPartitioner({"future": future_partitioner_dict})

# Root of the local Dropbox folder (machine-specific absolute path).
Mydropbox = '/Users/havardhegre/Dropbox (ViEWS)/ViEWS'

# Retrieve data

In [5]:
# Specifying querysets
# Rerun if querysets have changed (the original comment was cut off here - TODO confirm intent)
if RerunQuerysets:
    # Imported only for its import-time effects: nothing from the module is referenced by
    # name in this cell, and the listing printed below the cell appears on import.
    import cm_querysets

 .    hh_fat_cm_ged_ln_ultrashort; A dataset with 6 columns, with data between t 1 and 852. (213 units)
 .    hh_fatalities_ged_acled_ln; A dataset with 58 columns, with data between t 1 and 852. (213 units)
 .    fat_cm_conflict_history; A dataset with 29 columns, with data between t 1 and 852. (213 units)
 .    hh_fatalities_vdem; A dataset with 111 columns, with data between t 1 and 852. (213 units)
 .    hh_fatalities_vdem_short; A dataset with 63 columns, with data between t 1 and 852. (213 units)
 .    hh_fatalities_wdi;A dataset with 54 columns, with data between t 1 and 852. (213 units)
 .    hh_fatalities_wdi_short; A dataset with 32 columns, with data between t 1 and 852. (213 units)
 .    hh_topic_model; A dataset with 64 columns, with data between t 1 and 852. (213 units)
 .    hh_topic_model_short; A dataset with 39 columns, with data between t 1 and 852. (213 units)
 .    hh_all_features; A dataset with 181 columns, with data between t 1 and 852. (213 units)
 .    hh_broa

In [6]:
# Fetch data from the querysets
# FetchData comes from the local FetchData module; per its own printed message it returns a
# list of dictionaries containing datasets. Later cells read entries by position
# (Datasets[0]['df']) and by name through RetrieveFromList.
Datasets = FetchData(dev_id)

Fetching data using querysets; Fatalities002; returns as list of dictionaries containing datasets
 .    baseline: A dataset with 6 columns, with data between t = 1 and 852; 213 units.
 .    conflictlong_ln: A dataset with 58 columns, with data between t = 1 and 852; 213 units.
 .    conflict_ln: A dataset with 29 columns, with data between t = 1 and 852; 213 units.
 .    conflict_nolog: A dataset with 29 columns, with data between t = 1 and 852; 213 units.
 .    wdi_short: A dataset with 32 columns, with data between t = 1 and 852; 213 units.
 .    vdem_short: A dataset with 63 columns, with data between t = 1 and 852; 213 units.
 .    topics_short: A dataset with 39 columns, with data between t = 1 and 852; 213 units.
 .    broad: A dataset with 110 columns, with data between t = 1 and 852; 213 units.
 .    gh: A dataset with 57 columns, with data between t = 1 and 852; 213 units.
 .    hh20: A dataset with 39 columns, with data between t = 1 and 852; 213 units.
 .    all_features: A 

# Generating predictions
Using the new partitioning/stepshifting syntax. Training models for A: calibration partition and B: test partition, to test out some calibration routines. Most models trained with ln_ged_sb_best as outcome, but also one model with ged_sb_best to see whether that improves calibration on its own.

In [7]:
# Display the development id currently in use
dev_id

'Fatalities002'

In [8]:
# Preview the first rows of the first fetched dataset
# (listed as 'baseline' in the FetchData output above)
Datasets[0]['df'].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ln_ged_sb_dep,ln_ged_sb,wdi_sp_pop_totl,decay_ged_sb_5,decay_ged_os_5,splag_1_decay_ged_sb_5
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0.0,0.0,780153.0,0.0,0.0,0.0
1,2,0.0,0.0,359531.0,0.0,0.0,0.0
1,3,0.0,0.0,1084744.0,0.0,0.0,0.0
1,4,0.0,0.0,15182611.0,0.0,0.0,0.0
1,5,0.0,0.0,155525.0,0.0,0.0,0.0


In [9]:
# Print the ModelMetadata signature and docstring for reference;
# nothing else in this notebook uses ModelMetadata directly.
from views_runs import ModelMetadata 
help(ModelMetadata)

Help on class ModelMetadata in module views_schema.models:

class ModelMetadata(pydantic.main.BaseModel)
 |  ModelMetadata(*, author: str, queryset_name: str, train_start: int, train_end: int, steps: List[int] = None, training_date: datetime.datetime) -> None
 |  
 |  ModelMetadata
 |  
 |  Data used to organize model objects.
 |  
 |  parameters:
 |      author (str): Name of the user that authored the model object.
 |      queryset_name (str): Name of the queryset used to train the model
 |      train_start (int): Month identifier for training start date
 |      train_start (int): Month identifier for training end date
 |      training_date (datetime.datetime): Timestamp for training date (use datetime.datetime.now())
 |  
 |  example:
 |  
 |      # Instantiate the class with values
 |  
 |      my_metadata = ModelMetadata(
 |          author = "my_name",
 |          queryset_name = "my_queryset",
 |          train_start = 1,
 |          train_end = 300,
 |          steps = [1,2,3],

## Checking missingness and infinity values

In [10]:
# Report, for each of the first N columns of the first dataset, the number of rows,
# the number of missing values and the number of infinite values.
N = 51  # upper bound on the number of columns inspected
df = Datasets[0]['df']
for col in df.columns[:N]:
    column = df[col]
    # Count infinities in this column only. The earlier version evaluated np.isinf on the
    # whole frame inside the loop, so every row reported the same frame-wide total.
    n_infinite = np.isinf(column).values.sum()
    print(col, len(column), 'missing:', column.isnull().sum(), 'infinity:', n_infinite)


ln_ged_sb_dep 158230 missing: 0 infinity: 0
ln_ged_sb 158230 missing: 0 infinity: 0
wdi_sp_pop_totl 158230 missing: 11 infinity: 0
decay_ged_sb_5 158230 missing: 0 infinity: 0
decay_ged_os_5 158230 missing: 0 infinity: 0
splag_1_decay_ged_sb_5 158230 missing: 0 infinity: 0


## Identify early stopping parameter

See the Early_stopping_experiment notebook for how we arrived at the early stopping parameters for the XGBoost models.

# Specify models in ensemble

In [11]:
# The ModelList is a list of dictionaries that define a range of models for the project

nj = 12  # n_jobs handed to the XGBoost estimators below


def _model_spec(modelname, algorithm, data_train, queryset, preprocessing='float_it'):
    """Build one model-specification dictionary.

    Args:
        modelname: name used for the stored model objects and prediction stores.
        algorithm: estimator instance handed to StepshiftedModels by the training loop.
        data_train: name of the dataset looked up in Datasets via RetrieveFromList.
        queryset: queryset name recorded with the stored model.
        preprocessing: label of the preprocessing to apply (not yet acted upon by
            the training loop, see its "To do" note).

    Returns:
        dict with the keys modelname, algorithm, depvar, data_train, queryset and
        preprocessing, in that order. depvar is "ln_ged_sb_dep" for every model.
    """
    return {
        'modelname':     modelname,
        'algorithm':     algorithm,
        'depvar':        "ln_ged_sb_dep",
        'data_train':    data_train,
        'queryset':      queryset,
        'preprocessing': preprocessing,
    }


def _xgb_rf():
    """Return a fresh XGBoost random-forest regressor with the settings shared by the *_rf models."""
    return XGBRFRegressor(n_estimators=300, n_jobs=nj)


ModelList = [
    # NOTE(review): the queryset listing printed earlier shows 'hh_fat_cm_ged_ln_ultrashort';
    # confirm that 'hh_fatalities_ged_ln_ultrashort' is the intended name.
    _model_spec('fat_baseline_rf', _xgb_rf(),
                'baseline', 'hh_fatalities_ged_ln_ultrashort'),
    _model_spec('fat_conflicthistory_rf', _xgb_rf(),
                'conflict_ln', "fat_cm_conflict_history"),
    # Model: GED logged dependent variable, logged conflict history variables, gradient boosting
    _model_spec('fat_conflicthistory_gbm', GradientBoostingRegressor(),
                'conflict_ln', "fat_cm_conflict_history"),
    _model_spec('fat_conflicthistory_hurdle_lgb',
                HurdleRegression(clf_name='LGBMClassifier', reg_name='LGBMRegressor'),
                'conflict_ln', "fat_cm_conflict_history"),
    _model_spec('fat_conflicthistory_long_xgb',
                XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=nj),
                'conflictlong_ln', "hh_fatalities_ged_acled_ln"),
    _model_spec('fat_vdem_hurdle_xgb',
                HurdleRegression(clf_name='XGBClassifier', reg_name='XGBRegressor'),
                'vdem_short', "hh_fatalities_vdem_short"),
    _model_spec('fat_wdi_rf', _xgb_rf(),
                'wdi_short', "hh_fatalities_wdi_short"),
    _model_spec('fat_topics_rf', _xgb_rf(),
                'topics_short', "hh_topic_model_short"),
    _model_spec('fat_topics_histgbm', HistGradientBoostingRegressor(max_iter=200),
                'topics_short', "hh_topic_model_short"),
    # NOTE(review): named *_xgb but trained with the random-forest regressor; confirm intent.
    _model_spec('fat_broad_xgb', _xgb_rf(),
                'broad', 'hh_broad'),
    _model_spec('fatalities002_greatest_hits_hurdle_xgb',
                HurdleRegression(clf_name='XGBClassifier', reg_name='XGBRegressor'),
                'gh', 'fatalities002_greatest_hits'),
    _model_spec('fat_hh20_hurdle_rf',
                HurdleRegression(clf_name='RFClassifier', reg_name='RFRegressor'),
                'hh20', 'hh_20_features'),
    _model_spec('fat_hh20_hurdle_xgb',
                HurdleRegression(clf_name='XGBClassifier', reg_name='XGBRegressor'),
                'hh20', 'hh_20_features'),
    _model_spec('fat_hh20_hurdle_lgb',
                HurdleRegression(clf_name='LGBMClassifier', reg_name='LGBMRegressor'),
                'hh20', 'hh_20_features'),
    # PCA models: need to implement a PCA preprocessing function first.
    _model_spec('fat_all_pca3_xgb',
                XGBRegressor(n_estimators=100, learning_rate=0.05, n_jobs=nj),
                'pca_all', 'hh_all_features', preprocessing='pca_it'),
    _model_spec('fatalities002_aquastat_rf', _xgb_rf(),
                'aquastat', 'Fatalities002_aquastat'),
    _model_spec('fatalities002_faostat_rf', _xgb_rf(),
                'faostat', 'Fatalities002_faostat'),
    _model_spec('fatalities002_faoprices_rf', _xgb_rf(),
                'faoprices', 'Fatalities002_faoprices'),
    # NOTE(review): queryset prefix is 'Fatalities001' while its siblings use 'Fatalities002'; confirm.
    _model_spec('fatalities002_imfweo_rf', _xgb_rf(),
                'imfweo', 'Fatalities001_imfweo'),
]

# Overview: position, model name and training dataset of every specified model
for i, model in enumerate(ModelList):
    print(i, model['modelname'], model['data_train'])

0 fat_baseline_rf baseline
1 fat_conflicthistory_rf conflict_ln
2 fat_conflicthistory_gbm conflict_ln
3 fat_conflicthistory_hurdle_lgb conflict_ln
4 fat_conflicthistory_long_xgb conflictlong_ln
5 fat_vdem_hurdle_xgb vdem_short
6 fat_wdi_rf wdi_short
7 fat_topics_rf topics_short
8 fat_topics_histgbm topics_short
9 fat_broad_xgb broad
10 fatalities002_greatest_hits_hurdle_xgb gh
11 fat_hh20_hurdle_rf hh20
12 fat_hh20_hurdle_xgb hh20
13 fat_hh20_hurdle_lgb hh20
14 fat_all_pca3_xgb pca_all
15 fatalities002_aquastat_rf aquastat
16 fatalities002_faostat_rf faostat
17 fatalities002_faoprices_rf faoprices
18 fatalities002_imfweo_rf imfweo


In [12]:
# Loop that checks whether the model exists, retrains if not,
# and stores the predictions if they have not been stored before for this run.
# To do: set the data_preprocessing to the function in the model dictionary

# No explicit import of datetime is visible elsewhere in this notebook (it presumably arrived
# through one of the star imports); import it here so the cell does not depend on that.
from datetime import datetime

from views_runs import Storage, StepshiftedModels
from views_partitioning.data_partitioner import DataPartitioner
from viewser import Queryset, Column
from views_runs import operations
from views_runs.run_result import RunResult

level = 'cm'
includeFuture = False
force_retrain = False


def _retrain_or_retrieve(model, partition_name, partitioner_dict, storage_suffix, retrain):
    """Train, or fetch from model storage, the stepshifted models for one partition.

    Args:
        model: model-specification dictionary from ModelList.
        partition_name: key under which partitioner_dict is registered in the
            DataPartitioner, also passed on as partition_name.
        partitioner_dict: dictionary holding the "train" and "predict" timespans.
        storage_suffix: appended to model['modelname'] to form the stored object name.
        retrain: forwarded as the retrain flag of RunResult.retrain_or_retrieve.

    Returns:
        The object returned by RunResult.retrain_or_retrieve; this cell uses its
        .run and .data attributes.
    """
    return RunResult.retrain_or_retrieve(
            retrain            = retrain,
            store              = storage.Storage(),
            partitioner        = DataPartitioner({partition_name: partitioner_dict}),
            stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
            dataset            = RetrieveFromList(Datasets, model['data_train']),
            queryset_name      = model['queryset'],
            partition_name     = partition_name,
            timespan_name      = "train",
            storage_name       = model['modelname'] + storage_suffix,
            author_name        = "HH",
    )


def _read_or_predict(store_name, predict):
    """Return stored predictions for this run, generating and storing them if absent.

    Args:
        store_name: name of the prediction store to read or create.
        predict: zero-argument callable producing the predictions DataFrame; it is
            only called when read_store raises KeyError.

    Returns:
        The predictions DataFrame, either read from the store or newly generated.
    """
    print('Trying to retrieve predictions', datetime.now())
    try:
        return pd.DataFrame.forecasts.read_store(run=run_id, name=store_name)
    except KeyError:
        print(store_name, ', run', run_id, 'does not exist, predicting')
        predictions = predict()
        predictions.forecasts.set_run(run_id)
        predictions.forecasts.to_store(name=store_name)
        return predictions


for i, model in enumerate(ModelList):
    print(i, model['modelname'])
    # The estimator is replaced by [] at the end of each iteration. On a re-run of this
    # cell, keep the description recorded the first time instead of overwriting it with "[]".
    if 'Algorithm_text' not in model or not isinstance(model['algorithm'], list):
        model['Algorithm_text'] = str(model['algorithm'])

    print('Calibration partition', datetime.now())
    calib_result = _retrain_or_retrieve(model, "calib", calib_partitioner_dict, '_calib', force_retrain)
    model['RunResult_calib'] = calib_result
    model['predstore_calib'] = level + '_' + model['modelname'] + '_calib'
    predictions_calib = _read_or_predict(
        model['predstore_calib'],
        lambda: calib_result.run.predict("calib", "predict", calib_result.data),
    )

    print('Test partition', datetime.now())
    test_result = _retrain_or_retrieve(model, "test", test_partitioner_dict, '_test', force_retrain)
    model['RunResult_test'] = test_result
    model['predstore_test'] = level + '_' + model['modelname'] + '_test'
    predictions_test = _read_or_predict(
        model['predstore_test'],
        lambda: test_result.run.predict("test", "predict", test_result.data),
    )

    # Predictions for true future
    if includeFuture:
        print('Future', datetime.now())
        # NOTE(review): the future timespans are registered under the partition name "test",
        # exactly as in the earlier version of this cell; confirm this is intended.
        future_result = _retrain_or_retrieve(model, "test", future_partitioner_dict, '_future', force_retrain)
        model['RunResult_future'] = future_result
        model['predstore_future'] = level + '_' + model['modelname'] + '_f' + str(FutureStart)
        predictions_future = _read_or_predict(
            model['predstore_future'],
            lambda: future_result.run.future_point_predict(FutureStart, future_result.data),
        )

    print('**************************************************************')
    # Drop the estimator object; its text form is kept in Algorithm_text.
    model['algorithm'] = []

print('All done')

0 fat_baseline_rf
Calibration partition 2022-06-20 13:13:17.378682
 * == Performing a run: "fat_baseline_rf_calib" == * 
Model object named "fat_baseline_rf_calib" with equivalent metadata already exists.
Fetching "fat_baseline_rf_calib" from storage
Trying to retrieve predictions 2022-06-20 13:13:32.968727
pr_46_cm_fat_baseline_rf_calib.parquet
Test partition 2022-06-20 13:13:37.201889
 * == Performing a run: "fat_baseline_rf_test" == * 
Model object named "fat_baseline_rf_test" with equivalent metadata already exists.
Fetching "fat_baseline_rf_test" from storage
Trying to retrieve predictions 2022-06-20 13:13:57.612008
pr_46_cm_fat_baseline_rf_test.parquet
**************************************************************
1 fat_conflicthistory_rf
Calibration partition 2022-06-20 13:14:02.163477
 * == Performing a run: "fat_conflicthistory_rf_calib" == * 
Model object named "fat_conflicthistory_rf_calib" with equivalent metadata already exists.
Fetching "fat_conflicthistory_rf_calib" from

 * == Performing a run: "fatalities002_greatest_hits_hurdle_xgb_test" == * 
Model object named "fatalities002_greatest_hits_hurdle_xgb_test" with equivalent metadata already exists.
Fetching "fatalities002_greatest_hits_hurdle_xgb_test" from storage
Trying to retrieve predictions 2022-06-20 13:20:08.028396
pr_46_cm_fatalities002_greatest_hits_hurdle_xgb_test.parquet
**************************************************************
11 fat_hh20_hurdle_rf
Calibration partition 2022-06-20 13:20:13.598545
 * == Performing a run: "fat_hh20_hurdle_rf_calib" == * 
Model object named "fat_hh20_hurdle_rf_calib" with equivalent metadata already exists.
Fetching "fat_hh20_hurdle_rf_calib" from storage
Trying to retrieve predictions 2022-06-20 13:20:46.419513
pr_46_cm_fat_hh20_hurdle_rf_calib.parquet
Test partition 2022-06-20 13:20:53.584562
 * == Performing a run: "fat_hh20_hurdle_rf_test" == * 
Model object named "fat_hh20_hurdle_rf_test" with equivalent metadata already exists.
Fetching "fat_hh20_h

Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_faoprices_rf_calib"
Trying to retrieve predictions 2022-06-20 13:40:05.158407
pr_46_cm_fatalities002_faoprices_rf_calib.parquet
cm_fatalities002_faoprices_rf_calib , run Fatalities002 does not exist, predicting
Test partition 2022-06-20 13:40:20.140313
 * == Performing a run: "fatalities002_faoprices_rf_test" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_faoprices_rf_test"
Trying to retrieve predictions 2022-06-20 13:43:19.462170
pr_46_cm_fatalities002_faoprices_rf_test.parquet
cm_fatalities002_faoprices_rf_test , run Fatalities002 does not exist, predicting
**************************************************************
18 fatalities002_imfweo_rf
Calibration partition 2022-06-20 13:43:34.729827
 * == Performing a run: "fatalities002_imfweo_rf_calib" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_imfweo_rf_calib"
Trying to retrieve predictions 2022-06-20 13:45:46.111270
pr_46_cm_fatalities002_imfweo_rf_calib.parquet
cm_fatalities002_imfweo_rf_calib , run Fatalities002 does not exist, predicting
Test partition 2022-06-20 13:46:02.185133
 * == Performing a run: "fatalities002_imfweo_rf_test" == * 
Training model(s)...


Reordering feature dimension. Save memory by setting the outcome feature as the first column in your dataframe.


Storing "fatalities002_imfweo_rf_test"
Trying to retrieve predictions 2022-06-20 13:48:41.728343
pr_46_cm_fatalities002_imfweo_rf_test.parquet
cm_fatalities002_imfweo_rf_test , run Fatalities002 does not exist, predicting
**************************************************************
All done


In [22]:
# Exploring the predictions for one unit (value 246 on index level 1)
# NOTE: this inspects predictions_test, i.e. the test-partition predictions left over from the
# last model processed by the training loop, not the future predictions.


predictions_test.xs(246,level=1).tail()

Unnamed: 0_level_0,imfweo_ngdp_rpch_tcurrent,imfweo_ngdp_rpch_tmin1,imfweo_ngdp_rpch_tplus1,imfweo_ngdp_rpch_tplus2,ln_ged_sb_dep,gleditsch_ward,ln_ged_sb,wdi_sp_pop_totl,decay_ged_sb_5,decay_ged_os_5,...,step_pred_33,step_pred_34,step_pred_35,step_pred_36,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9
month_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
488,0.0,0.0,0.0,0.0,3.218876,626.0,3.218876,11062113.0,1.0,1.0,...,3.913047,3.388252,3.466939,3.391691,1.709457,1.647634,1.139124,1.967838,2.335133,1.886065
489,0.0,0.0,0.0,0.0,0.693147,626.0,0.693147,11062113.0,1.0,0.971532,...,3.839852,3.781158,3.137412,3.756255,2.61308,1.548834,2.048799,1.024652,2.154551,1.861133
490,0.0,0.0,0.0,0.0,1.609438,626.0,1.609438,11062113.0,0.971532,0.943874,...,3.369635,3.725624,3.738224,3.355103,2.046697,2.755813,1.464292,1.73821,1.514066,1.852678
491,0.0,0.0,0.0,0.0,2.484907,626.0,2.484907,11062113.0,0.943874,0.917004,...,3.637645,3.567117,3.625956,3.974339,2.455626,1.9259,2.770793,1.607153,2.113373,1.534434
492,0.0,0.0,0.0,0.0,0.0,626.0,0.0,11062113.0,1.0,0.890899,...,3.243661,3.710055,3.410763,3.963485,2.856974,2.578416,2.112898,2.429112,1.426705,1.302743


## Notes on training time for the various algorithms:

In [23]:
#These are calculated in minutes for the hh20 feature set (with about 40 features), for all 36 steps, calibration (c) and test (t) partitions, also include generating predictions, and are approximate:

#nj=12 (number of threads)
#scikit random forest:        21:13 (c), 26:20 (t) RandomForestRegressor(n_estimators=200, n_jobs=nj)
#XGB random forest:           06:02 (c), 07:51 (t) XGBRFRegressor(n_estimators=300,n_jobs=nj)
#scikit gbm:                  13:59 (c), 15:55 (t) GradientBoostingRegressor(), 
#scikit hurdle random forest: 07:32 (c), 09:49 (t) For both clf and reg: (n_estimators=200, n_jobs=nj)
#XGB hurdle xgb:              01:26 (c), 01:32 (t) For both clf and reg:                n_estimators=200,tree_method='hist',n_jobs=nj)
#scikit histgbm:              01:17 (c), 01:20 (t) HistGradientBoostingRegressor(max_iter=200)
#XGB xgb:                     01:00 (c), 01:04 (t) XGBRegressor(n_estimators=200,tree_method='hist',n_jobs=nj)
#lightgbm gbm:                00:25 (c), --    (t) LGBMRegressor(n_estimators=100,num_threads=8)

# Various helper functions and tools....

In [24]:
!conda list | grep views-forecasts

views-forecasts           0.5.1                    pypi_0    pypi


# Retrieving external forecasts

In [25]:
# Retrieve David's Markov models
# To do: rewrite the model dictionary to the new, slimmer version.
DRList = []

for markov_name in ('fat_hh20_Markov_glm', 'fat_hh20_Markov_rf'):
    model = {
        'modelname': markov_name,
        'algorithm': [],
        'depvar': "ln_ged_sb_dep",
        'data_train': 'hh20',
        'queryset': 'hh_20_features',
    }
    # The same dictionary object is registered in both lists, so keys added later
    # through DRList are visible through ModelList as well.
    ModelList.append(model)
    DRList.append(model)



In [26]:
path = '/Users/havardhegre/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/'

# Prediction csv files per external model, in DRList order (glm first, then rf).
# NOTE(review): the two future files carry different month suffixes (506 and 505), and
# neither matches FutureStart = 508; confirm these are the intended files.
markov_prediction_files = (
    {
        'calib':  'vmm_glm_hh20_0125_alt_calib.csv',
        'test':   'vmm_glm_hh20_0125_alt_test.csv',
        'future': 'vmm_glm_hh20_506.csv',
    },
    {
        'calib':  'vmm_rf_hh20_0125_alt_calib.csv',
        'test':   'vmm_rf_hh20_0125_alt_test.csv',
        'future': 'vmm_rf_hh20_505.csv',
    },
)

for position, files in enumerate(markov_prediction_files):
    for partition, csv_name in files.items():
        DRList[position]['predictions_file_' + partition] = path + csv_name

In [27]:
# Storing Markov models in central storage
# Retrieving dependent variable
# The outcome column is read from the stored predictions of an already-trained constituent
# model so that it can be attached to the externally produced Markov predictions.
target_calib = pd.DataFrame.forecasts.read_store('cm_fat_conflicthistory_rf_calib', run=run_id)['ln_ged_sb_dep']
target_test = pd.DataFrame.forecasts.read_store('cm_fat_conflicthistory_rf_test', run=run_id)['ln_ged_sb_dep']
level = 'cm'
for model in DRList:
    # Each csv is read with (month_id, country_id) as its index
    df_calib = pd.read_csv(model['predictions_file_calib'],index_col=['month_id','country_id'])
    df_test = pd.read_csv(model['predictions_file_test'],index_col=['month_id','country_id'])
    df_future = pd.read_csv(model['predictions_file_future'],index_col=['month_id','country_id'])
    # Assigning a Series aligns on the index; assumes the stored targets are indexed the
    # same way as the csv files - TODO confirm
    df_calib['ln_ged_sb_dep'] = target_calib
    df_test['ln_ged_sb_dep'] = target_test
    df_future['ln_ged_sb_dep'] = np.nan # Empty dependent variable column for consistency/required by prediction storage function
    # NOTE(review): df_future is read and prepared but never stored in this cell;
    # confirm whether a to_store call for the future predictions is missing.
    # Calibration and test predictions are written under this run, overwriting existing stores
    stored_modelname = level + '_' + model['modelname'] + '_calib'
    df_calib.forecasts.set_run(run_id)
    df_calib.forecasts.to_store(name=stored_modelname, overwrite=True)
    stored_modelname = level + '_' + model['modelname'] + '_test'
    df_test.forecasts.set_run(run_id)
    df_test.forecasts.to_store(name=stored_modelname, overwrite=True)    

pr_46_cm_fat_conflicthistory_rf_calib.parquet
pr_46_cm_fat_conflicthistory_rf_test.parquet


In [19]:
# Removing run result objects before saving
# These should perhaps be discarded immediately above
# Every model that carries a run-result entry is cleared. The earlier fixed slice
# ModelList[0:-3] assumed three trailing external models, but only two are appended in this
# notebook, so the last trained model kept its run results. RunResult_future is cleared as
# well, since it is present whenever includeFuture is switched on.
for model in ModelList:
    for key in ('RunResult_calib', 'RunResult_test', 'RunResult_future'):
        if key in model:
            model[key] = []

In [20]:
# Display the full list of model dictionaries (long output)
ModelList

[{'modelname': 'fat_baseline_rf',
  'algorithm': [],
  'depvar': 'ln_ged_sb_dep',
  'data_train': 'baseline',
  'queryset': 'hh_fatalities_ged_ln_ultrashort',
  'preprocessing': 'float_it',
  'Algorithm_text': "XGBRFRegressor(base_score=None, booster=None, colsample_bylevel=None,\n               colsample_bytree=None, enable_categorical=False, gamma=None,\n               gpu_id=None, importance_type=None, interaction_constraints=None,\n               max_delta_step=None, max_depth=None, min_child_weight=None,\n               missing=nan, monotone_constraints=None, n_estimators=300,\n               n_jobs=12, num_parallel_tree=None, objective='reg:squarederror',\n               predictor=None, random_state=None, reg_alpha=None,\n               scale_pos_weight=None, tree_method=None,\n               validate_parameters=None, verbosity=None)",
  'RunResult_calib': [],
  'predstore_calib': 'cm_fat_baseline_rf_calib',
  'RunResult_test': [],
  'predstore_test': 'cm_fat_baseline_rf_test'},


# Save the model list of dictionaries

In [28]:
# Write the model list (one row per model dictionary) to two csv files:
# one in the local Dropbox folder and one in the current working directory.
ModelList_df = pd.DataFrame(ModelList)
localpath = '/Users/havardhegre/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/'

# The file name used to be built from `model`, a loop variable leaked from whichever cell
# ran last, so the output path depended on execution order. Reading the name from the last
# entry of ModelList makes it deterministic.
# NOTE(review): naming a file that holds the whole list after a single model looks
# unintended; consider 'ModelList_cm_' + dev_id + '.csv' once downstream readers are checked.
last_modelname = ModelList[-1]['modelname']
filename = localpath + 'Model_cm_' + last_modelname + '_' + dev_id + '.csv'
ModelList_df.to_csv(filename)

# Copy with a fixed name, written relative to the working directory
gitname = 'ModelList_cm_wide_' + dev_id + '.csv'
ModelList_df.to_csv(gitname)