
# ViEWS 3 constituent models 

## ViEWS production system, pgm level


This notebook trains a set of regression models for use in the monthly updated ViEWS "predicting fatalities" ensemble.

The notebook does the following: 
1. Retrieves data through querysets and stores in DataSets, a list of dictionaries
2. Specifies the metadata of a number of models, stores in ModelList, a list of dictionaries
3. Trains the models in ModelList, stores the trained objects in model storage and prediction storage
4. Saves part of ModelList as csv and the rest as pickles


## Importing modules

In [1]:
# Shell escape: list the active viewser configuration settings.
! viewser config list


  |:---------------------------------|:-----------------------------------------------------------------------------------------|
  | RETRY_FREQUENCY                  | 5                                                                                        |
  | LOG_LEVEL                        | INFO                                                                                     |
  | HANDSHAKE_PATH                   |                                                                                          |
  | REPO_URL                         | https://www.github.com/prio-data/viewser                                                 |
  | LATEST_KNOWN_VERSION             | 0.0.0                                                                                    |
  | NOTEBOOK_SERVER_IMAGE_REPOSITORY | prio-data/viewserspace                                                                   |
  | NOTEBOOK_SERVER_IMAGE_REGISTRY   | viewsregistry.azurecr.io                  

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Basics
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier

from lightgbm import LGBMClassifier, LGBMRegressor

# Views 3
from viewser.operations import fetch
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
from views_runs import storage
from views_runs.storage import store, retrieve, fetch_metadata

from views_forecasts.extensions import *

# Other packages
import pickle as pkl

# Packages from Predicting Fatalities repository

import os
import sys
sys.path.append('../')
sys.path.append('../Tools')
sys.path.append('../Intermediates')
from FetchData import FetchData, RetrieveFromList, document_queryset, ReturnQsList, document_ensemble,data_integrity_check
from ViewsEstimators import *

## Common parameters

In [4]:
# Common parameters:
# Notebook-wide configuration: run identifiers, the dependent variable, the
# steps to train for, the train/predict partitions, and the local Dropbox path.
dev_id = 'Fatalities002'
run_id = dev_id  # the run name used when predictions are stored

# Generating a new run if necessary

#try:
#    ViewsMetadata().new_run(name=run_id,description='pgm_level_fatalities',min_month=1,max_month=999)
#except KeyError:
#    if 'devel' not in run_id:
#        warnings.warn('You are overwriting a production system')

depvar="ln_ged_sb_dep"

RerunQuerysets = True

# NOTE(review): FutureStart (518) lies outside the "predict" span of
# future_partitioner_dict below (505, 516) — confirm this is intended.
FutureStart = 518
steps = [*range(1, 36+1, 1)] # Which steps to train and predict for
fi_steps = [1,3,6,12,36] # Which steps to present feature importances for
#steps = [1,3,6,12,36]
#fi_steps = [1,3,6,12,36]

# Specifying partitions
# Each dictionary gives a "train" and a "predict" span; the tuples are
# presumably (first, last) values on the datasets' time index — TODO confirm.
calib_partitioner_dict = {"train":(121,408),"predict":(409,456)}
test_partitioner_dict = {"train":(121,456),"predict":(457,504)}
future_partitioner_dict = {"train":(121,504),"predict":(505,516)}
calib_partitioner =  views_runs.DataPartitioner({"calib":calib_partitioner_dict})
test_partitioner =  views_runs.DataPartitioner({"test":test_partitioner_dict})
future_partitioner =  views_runs.DataPartitioner({"future":future_partitioner_dict})

# os.getlogin() requires a controlling terminal and raises OSError when the
# kernel runs without one; fall back to getpass.getuser() in that case.
import getpass

try:
    _username = os.getlogin()
except OSError:
    _username = getpass.getuser()

Mydropbox = f'/Users/{_username}/Dropbox (ViEWS)/ViEWS'
print('Setting Mydropbox to',Mydropbox)

Setting Mydropbox to /Users/root/Dropbox (ViEWS)/ViEWS


## Retrieve data

In [5]:
# Create Markdown documentation of all querysets used
level = 'pgm'
qslist = ReturnQsList(level)
document_queryset(qslist,dev_id)

Model:  fatalities002_pgm_baseline
Model:  fatalities002_pgm_conflictlong
Model:  fatalities002_pgm_escwa_drought
Model:  fatalities002_pgm_natsoc
Model:  fatalities002_pgm_broad
Model:  fatalities002_pgm_conflict_history
Model:  fatalities002_pgm_conflict_treelag
Model:  fatalities002_pgm_conflict_sptime_dist


In [6]:
#if RerunQuerysets:
#    import pgm_querysets

In [7]:
from FetchData import fetch_pgm_data_from_model_def

# Fetch the data for the querysets in qslist. Datasets is looked up by name
# later on through RetrieveFromList(Datasets, model['data_train']).
Datasets=fetch_pgm_data_from_model_def(qslist)

32: "Queryset fatalities002_pgm_conflict_treelag *transform in progress* - 2 of 31 jobs remaining"        

100%|██████████| 297M/297M [00:41<00:00, 7.12MB/s] 


Queryset fatalities002_pgm_conflict_treelag read successfully                                      
conflicttreelag: A dataset with 8 columns, with data between t = 1 and 852; 13110 units.
112: "Queryset fatalities002_pgm_broad *transform in progress* - 4 of 105 jobs remaining"       

100%|██████████| 316M/316M [00:44<00:00, 7.07MB/s] 


Queryset fatalities002_pgm_broad read successfully                                        
broad: A dataset with 23 columns, with data between t = 1 and 852; 13110 units.
88: "Queryset fatalities002_pgm_conflict_sptime_dist *transform in progress* - 6 of 43 jobs remaining"       

100%|██████████| 109M/109M [00:09<00:00, 11.1MB/s] 


Queryset fatalities002_pgm_conflict_sptime_dist read successfully                                      
conflictsptime_dist: A dataset with 11 columns, with data between t = 1 and 852; 13110 units.
15: "Queryset fatalities002_pgm_conflictlong *transform in progress* - 2 of 121 jobs remaining"       

100%|██████████| 112M/112M [00:10<00:00, 10.7MB/s] 


Queryset fatalities002_pgm_conflictlong read successfully                                       
conflictlong: A dataset with 19 columns, with data between t = 1 and 852; 13110 units.
61: "Queryset fatalities002_pgm_conflict_history *transform in progress* - 6 of 169 jobs remaining"        

100%|██████████| 26.3M/26.3M [00:02<00:00, 10.6MB/s]


Queryset fatalities002_pgm_conflict_history read successfully                                       
conflicthist: A dataset with 30 columns, with data between t = 1 and 852; 13110 units.
8: "Queryset fatalities002_pgm_baseline *transform in progress* - 4 of 50 jobs remaining"        

100%|██████████| 106M/106M [00:09<00:00, 10.8MB/s] 


Queryset fatalities002_pgm_baseline read successfully                                     
baseline: A dataset with 8 columns, with data between t = 1 and 852; 13110 units.
46: "Queryset fatalities002_pgm_natsoc *transform in progress* - 3 of 119 jobs remaining"        

100%|██████████| 231M/231M [00:27<00:00, 8.45MB/s] 


Queryset fatalities002_pgm_natsoc read successfully                                       
natsoc: A dataset with 24 columns, with data between t = 1 and 852; 13110 units.
59: "Queryset fatalities002_pgm_escwa_drought *transform in progress* - 8 of 106 jobs remaining"        

100%|██████████| 264M/264M [00:25<00:00, 10.2MB/s] 


Queryset fatalities002_pgm_escwa_drought read successfully                                       
escwa_drought: A dataset with 29 columns, with data between t = 1 and 852; 13110 units.


In [8]:
# Run the integrity check on every fetched dataset, passing the dependent
# variable name. NOTE(review): judging by the cell output, the check may also
# reorder columns — confirm in FetchData.data_integrity_check.
for ds in Datasets:

    data_integrity_check(ds,depvar)

Reordering columns in model conflicttreelag
Reordering columns in model broad
Reordering columns in model conflictsptime_dist
Reordering columns in model escwa_drought


# Generating predictions
This section uses the ViEWS3 partitioning/stepshifting syntax. Models are trained for A: the calibration partition and B: the test partition, to test out some calibration routines. Most models are trained with `ln_ged_sb_dep` as the outcome (the `depvar` set under "Common parameters").

In [9]:
# Display the development id currently in use.
dev_id

'Fatalities002'

# Specify models in ensemble

In [10]:
from ModelDefinitions import DefineEnsembleModels

# Build the list of model-specification dictionaries for the pgm level.
ModelList = DefineEnsembleModels('pgm')

# Overview, one line per model: position in ModelList, model name and the
# name of the dataset the model is trained on.
for imodel, model in enumerate(ModelList):
    print(' '.join(map(str, (imodel, model['modelname'], model['data_train']))))

0 fatalities002_pgm_baseline_lgbm baseline
1 fatalities002_pgm_conflictlong_lgbm conflictlong
2 fatalities002_pgm_conflictlong_hurdle_lgbm conflictlong
3 fatalities002_pgm_escwa_drought_hurdle_lgbm escwa_drought
4 fatalities002_pgm_escwa_drought_lgbm escwa_drought
5 fatalities002_pgm_natsoc_hurdle_lgbm natsoc
6 fatalities002_pgm_natsoc_lgbm natsoc
7 fatalities002_pgm_broad_hurdle_lgbm broad
8 fatalities002_pgm_broad_lgbm broad
9 fatalities002_pgm_conflict_history_xgb conflicthist
10 fatalities002_pgm_conflict_treelag_hurdle conflicttreelag
11 fatalities002_pgm_conflict_sptime_dist_hurdle conflictsptime_dist


In [11]:
# Document the models in ModelList; the meaning of the 'sb' argument is
# defined in FetchData.document_ensemble (presumably the outcome type — confirm).
document_ensemble(ModelList,'sb')

0 fatalities002_pgm_baseline_lgbm baseline
1 fatalities002_pgm_conflictlong_lgbm conflictlong
2 fatalities002_pgm_conflictlong_hurdle_lgbm conflictlong
3 fatalities002_pgm_escwa_drought_hurdle_lgbm escwa_drought
4 fatalities002_pgm_escwa_drought_lgbm escwa_drought
5 fatalities002_pgm_natsoc_hurdle_lgbm natsoc
6 fatalities002_pgm_natsoc_lgbm natsoc
7 fatalities002_pgm_broad_hurdle_lgbm broad
8 fatalities002_pgm_broad_lgbm broad
9 fatalities002_pgm_conflict_history_xgb conflicthist
10 fatalities002_pgm_conflict_treelag_hurdle conflicttreelag
11 fatalities002_pgm_conflict_sptime_dist_hurdle conflictsptime_dist


In [12]:
# Loop that checks whether the model exists, retrains if not,
# and stores the predictions if they have not been stored before for this run.
# To do: set the data_preprocessing to the function in the model dictionary
#
# For each model, the calibration and test partitions (and the true future,
# when includeFuture is True) go through the same two stages:
#   1. RunResult.retrain_or_retrieve(...) is called with retrain=force_retrain
#      and the result is kept in model['RunResult_<partition>'].
#   2. Predictions are either generated and written to a local parquet file
#      (force_rewrite=True; also pushed to the prediction store when
#      store_remote=True), or read back from the prediction store, predicting
#      and storing them only if the read raises KeyError.

level = 'pgm'
includeFuture = False   # also train and predict for the true future
force_rewrite = True    # always re-predict instead of reading stored predictions
force_retrain = True    # passed as `retrain` to RunResult.retrain_or_retrieve
store_remote = False    # with force_rewrite: also write predictions to the prediction store

# Explicit import: datetime.now() is used for the progress timestamps below and
# was previously only available through one of the star imports.
from datetime import datetime

from views_runs import Storage, StepshiftedModels
from views_partitioning.data_partitioner import DataPartitioner
from viewser import Queryset, Column
from views_runs import operations
from views_runs.run_result import RunResult

# NOTE(review): only the models from index 10 onwards are processed, and `i`
# counts from 0 within that slice, so the printed number is not the ModelList index.
i = 0
for model in ModelList[10:]:
    # ---- Calibration partition ----
    modelstore = storage.Storage()
    ct = datetime.now()
    print(i, model['modelname'])
    print('Calibration partition', ct)
    # Plain-text copy of the estimator specification
    model['Algorithm_text'] = str(model['algorithm'])
    model['RunResult_calib'] = RunResult.retrain_or_retrieve(
            retrain            = force_retrain,
            store              = modelstore,
            partitioner        = DataPartitioner({"calib":calib_partitioner_dict}),
            stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
            dataset            = RetrieveFromList(Datasets,model['data_train']),
            queryset_name      = model['queryset'],
            partition_name     = "calib",
            timespan_name      = "train",
            storage_name       = model['modelname'] + '_calib',
            author_name        = "JED",
    )

    model['predstore_calib'] = level +  '_' + model['modelname'] + '_calib'
    ct = datetime.now()
    if force_rewrite:
        print(model['predstore_calib'], ', run',  run_id, 'force_rewrite=True, predicting')
        predictions_calib = model['RunResult_calib'].run.predict("calib","predict", model['RunResult_calib'].data)

        predictions_calib.to_parquet(model['predstore_calib']+'.parquet')
        if store_remote:
            predictions_calib.forecasts.set_run(run_id)
            predictions_calib.forecasts.to_store(name=model['predstore_calib'],overwrite=True)
    else:
        print('Trying to retrieve predictions', ct)
        try:
            predictions_calib = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstore_calib'])
        except KeyError:
            print(model['predstore_calib'], ', run',  run_id, 'does not exist, predicting')
            predictions_calib = model['RunResult_calib'].run.predict("calib","predict", model['RunResult_calib'].data)
            predictions_calib.forecasts.set_run(run_id)
            predictions_calib.forecasts.to_store(name=model['predstore_calib'])

    # ---- Test partition ----
    ct = datetime.now()
    print('Test partition', ct)
    modelstore = storage.Storage()
    model['RunResult_test'] = RunResult.retrain_or_retrieve(
            retrain            = force_retrain,
            store              = modelstore,
            partitioner        = DataPartitioner({"test":test_partitioner_dict}),
            stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
            dataset            = RetrieveFromList(Datasets,model['data_train']),
            queryset_name      = model['queryset'],
            partition_name     = "test",
            timespan_name      = "train",
            storage_name       = model['modelname'] + '_test',
            author_name        = "JED",
    )
    # Guarantee the key exists without overriding a name already present in the
    # model dictionary; the fallback follows the same pattern as predstore_calib.
    model.setdefault('predstore_test', level +  '_' + model['modelname'] + '_test')
    ct = datetime.now()
    if force_rewrite:
        print(model['predstore_test'], ', run',  run_id, 'force_rewrite=True, predicting')
        predictions_test = model['RunResult_test'].run.predict("test","predict", model['RunResult_test'].data)

        predictions_test.to_parquet(model['predstore_test']+'.parquet')
        if store_remote:
            predictions_test.forecasts.set_run(run_id)
            predictions_test.forecasts.to_store(name=model['predstore_test'],overwrite=True)
    else:
        print('Trying to retrieve predictions', ct)
        try:
            predictions_test = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstore_test'])
        except KeyError:
            print(model['predstore_test'], ', run', run_id, 'does not exist, predicting')
            predictions_test = model['RunResult_test'].run.predict("test","predict",model['RunResult_test'].data)
            predictions_test.forecasts.set_run(run_id)
            predictions_test.forecasts.to_store(name=model['predstore_test'])

    # ---- Predictions for true future ----
    if includeFuture:
        ct = datetime.now()
        print('Future', ct)
        modelstore = storage.Storage()
        # NOTE(review): the future partition is registered under the name "test"
        # here (partitioner key and partition_name agree) — confirm this is intended.
        model['RunResult_future'] = RunResult.retrain_or_retrieve(
                retrain            = force_retrain,
                store              = modelstore,
                partitioner        = DataPartitioner({"test":future_partitioner_dict}),
                stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
                dataset            = RetrieveFromList(Datasets,model['data_train']),
                queryset_name      = model['queryset'],
                partition_name     = "test",
                timespan_name      = "train",
                storage_name       = model['modelname'] + '_future',
                author_name        = "JED",
        )
        # Set the store name before either branch uses it; previously it was only
        # assigned in the retrieval branch, so the force_rewrite branch read a
        # key that had never been set.
        model['predstore_future'] = level +  '_' + model['modelname'] + '_f' + str(FutureStart)
        ct = datetime.now()
        if force_rewrite:
            print(model['predstore_future'], ', run',  run_id, 'force_rewrite=True, predicting')
            # Use the same future prediction call as the retrieval branch below;
            # run.predict is called with (partition, timespan, data) everywhere
            # else in this cell and is not given a start time.
            predictions_future = model['RunResult_future'].run.future_point_predict(FutureStart,model['RunResult_future'].data)
            predictions_future.to_parquet(model['predstore_future']+'.parquet')

            if store_remote:
                predictions_future.forecasts.set_run(run_id)
                predictions_future.forecasts.to_store(name=model['predstore_future'],overwrite=True)
        else:
            print('Trying to retrieve predictions', ct)
            try:
                predictions_future = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstore_future'])
            except KeyError:
                print(model['predstore_future'], ', run', run_id, 'does not exist, predicting')
                predictions_future = model['RunResult_future'].run.future_point_predict(FutureStart,model['RunResult_future'].data)
                predictions_future.forecasts.set_run(run_id)
                predictions_future.forecasts.to_store(name=model['predstore_future'])
#    model['algorithm'] = []
    i = i + 1

print('All done')

0 fatalities002_pgm_conflict_treelag_hurdle
Calibration partition 2024-03-28 21:25:47.241331
 * == Performing a run: "fatalities002_pgm_conflict_treelag_hurdle_calib" == * 
Model object named "fatalities002_pgm_conflict_treelag_hurdle_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_pgm_conflict_treelag_hurdle_calib"
Training model(s)...
Storing "fatalities002_pgm_conflict_treelag_hurdle_calib"
pgm_fatalities002_pgm_conflict_treelag_hurdle_calib , run Fatalities002 force_rewrite=True, predicting
Test partition 2024-03-28 21:41:15.595800
 * == Performing a run: "fatalities002_pgm_conflict_treelag_hurdle_test" == * 
Model object named "fatalities002_pgm_conflict_treelag_hurdle_test" with equivalent metadata already exists.
Retrain is true, overwriting "fatalities002_pgm_conflict_treelag_hurdle_test"
Training model(s)...
Storing "fatalities002_pgm_conflict_treelag_hurdle_test"
pgm_fatalities002_pgm_conflict_treelag_hurdle_test , run Fatalities002

In [13]:
# Turn the list of model dictionaries into a DataFrame (one row per model,
# one column per dictionary key).
ModelMetaData = pd.DataFrame(ModelList)

In [14]:
# Display the model metadata table.
ModelMetaData

Unnamed: 0,modelname,algorithm,depvar,queryset,data_train,level,preprocessing,description,long_description,predstore_calib,predstore_test,Algorithm_text,RunResult_calib,RunResult_test
0,fatalities002_pgm_baseline_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,fatalities002_pgm_baseline,baseline,pgm,float_it,,,pgm_fatalities002_pgm_baseline_lgbm_calib,pgm_fatalities002_pgm_baseline_lgbm_test,,,
1,fatalities002_pgm_conflictlong_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,fatalities002_pgm_conflictlong,conflictlong,pgm,float_it,,,pgm_fatalities002_pgm_conflictlong_lgbm_calib,pgm_fatalities002_pgm_conflictlong_lgbm_test,,,
2,fatalities002_pgm_conflictlong_hurdle_lgbm,"HurdleRegression(clf_name='LGBMClassifier', cl...",ln_ged_sb_dep,fatalities002_pgm_conflictlong,conflictlong,pgm,float_it,,,pgm_fatalities002_pgm_conflictlong_hurdle_lgbm...,pgm_fatalities002_pgm_conflictlong_hurdle_lgbm...,,,
3,fatalities002_pgm_escwa_drought_hurdle_lgbm,"HurdleRegression(clf_name='LGBMClassifier', cl...",ln_ged_sb_dep,fatalities002_pgm_escwa_drought,escwa_drought,pgm,float_it,,,pgm_fatalities002_pgm_escwa_drought_hurdle_lgb...,pgm_fatalities002_pgm_escwa_drought_hurdle_lgb...,,,
4,fatalities002_pgm_escwa_drought_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,fatalities002_pgm_escwa_drought,escwa_drought,pgm,float_it,,,pgm_fatalities002_pgm_escwa_drought_lgbm_calib,pgm_fatalities002_pgm_escwa_drought_lgbm_test,,,
5,fatalities002_pgm_natsoc_hurdle_lgbm,"HurdleRegression(clf_name='LGBMClassifier', cl...",ln_ged_sb_dep,fatalities002_pgm_natsoc,natsoc,pgm,float_it,,,pgm_fatalities002_pgm_natsoc_hurdle_lgbm_calib,pgm_fatalities002_pgm_natsoc_hurdle_lgbm_test,,,
6,fatalities002_pgm_natsoc_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,fatalities002_pgm_natsoc,natsoc,pgm,float_it,,,pgm_fatalities002_pgm_natsoc_lgbm_calib,pgm_fatalities002_pgm_natsoc_lgbm_test,,,
7,fatalities002_pgm_broad_hurdle_lgbm,"HurdleRegression(clf_name='LGBMClassifier', cl...",ln_ged_sb_dep,fatalities002_pgm_broad,broad,pgm,float_it,,,pgm_fatalities002_pgm_broad_hurdle_lgbm_calib,pgm_fatalities002_pgm_broad_hurdle_lgbm_test,,,
8,fatalities002_pgm_broad_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,fatalities002_pgm_broad,broad,pgm,float_it,,,pgm_fatalities002_pgm_broad_lgbm_calib,pgm_fatalities002_pgm_broad_lgbm_test,,,
9,fatalities002_pgm_conflict_history_xgb,"XGBRegressor(base_score=None, booster=None, ca...",ln_ged_sb_dep,fatalities002_pgm_conflict_history,conflicthist,pgm,float_it,,,pgm_fatalities002_pgm_conflict_history_xgb_calib,pgm_fatalities002_pgm_conflict_history_xgb_test,,,
