
# ViEWS 3 constituent models 

## Fatalities project, pgm level


This notebook trains a set of regression models for use in an ensemble that predicts fatalities.


The notebook does the following: 
1. Retrieves data through querysets and stores it in `Datasets`, a list of dictionaries
2. Specifies the metadata of a number of models and stores it in `ModelList`, a list of dictionaries
3. Trains the models in `ModelList` and stores the trained objects in model storage and their predictions in prediction storage


## Importing modules

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell is executed so that edits to imported .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier

from lightgbm import LGBMClassifier, LGBMRegressor

# Views 3
from viewser.operations import fetch
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
import views_dataviz
from views_runs import storage
from views_runs.storage import store, retrieve, fetch_metadata

from views_forecasts.extensions import *

# Other packages
import pickle as pkl

# Packages from Predicting Fatalities repository

from HurdleRegression import * # Built on script from Geoff Ruddock: https://geoffruddock.com/building-a-hurdle-regression-estimator-in-scikit-learn/
from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated
import FetchData
from FetchData import FetchData_pgm, RetrieveFromList, data_integrity_check, index_check
# NOTE(review): this repeats the wildcard import of HurdleRegression made a few lines above.
from HurdleRegression import *



## Common parameters

In [3]:
# Common parameters shared by every cell below.

# --- Run identification ------------------------------------------------------
# dev_id is passed to FetchData_pgm and used in the exported CSV names;
# run_id tags the stored predictions.
dev_id = 'Fatalities001'
run_id = 'Fatalities001'

# Registering a new run, if necessary (left disabled):
#try:
#    ViewsMetadata().new_run(name=run_id,description='pgm_level_fatalities',min_month=1,max_month=999)
#except KeyError:
#    if 'devel' not in run_id:
#        warnings.warn('You are overwriting a production system')

# --- Target variable and data retrieval --------------------------------------
depvar = "ln_ged_sb_dep"

# When True, the data-retrieval cell imports pgm_querysets.
RerunQuerysets = True

# --- Forecast horizon --------------------------------------------------------
# FutureStart is handed to future_point_predict and appended to the names of
# the stored future predictions.
FutureStart = 506
steps = list(range(1, 37))       # Which steps to train and predict for (1..36)
fi_steps = [1, 3, 6, 12, 36]     # Which steps to present feature importances for
# Reduced step set, handy for quick runs:
#steps = [1,3,6,12,36]
#fi_steps = [1,3,6,12,36]

# --- Partitions --------------------------------------------------------------
# Each dictionary gives the (first, last) time index of the training and
# prediction spans (presumably ViEWS month ids; confirm).
calib_partitioner_dict = {
    "train": (121, 396),
    "predict": (397, 444),
}
test_partitioner_dict = {
    "train": (121, 444),
    "predict": (445, 492),
}
future_partitioner_dict = {
    "train": (121, 492),
    "predict": (493, 504),
}

calib_partitioner = views_runs.DataPartitioner({"calib": calib_partitioner_dict})
test_partitioner = views_runs.DataPartitioner({"test": test_partitioner_dict})
future_partitioner = views_runs.DataPartitioner({"future": future_partitioner_dict})

# --- Local paths -------------------------------------------------------------
# NOTE(review): absolute, user-specific path; other users must edit it.
Mydropbox = '/Users/jim/Dropbox (ViEWS)/ViEWS'

## Retrieve data

In [7]:
# pgm_querysets is imported purely for its side effects; judging by the output
# below it processes one queryset per dataset (presumably (re)publishing them;
# confirm against pgm_querysets.py).
# Python executes a module body only on its first import in a kernel session,
# so re-running this cell does not necessarily re-run the querysets.
if RerunQuerysets:
    import pgm_querysets

 .    
A dataset with 8 columns, with data between t 1 and 852. (13110 units)
 .    
A dataset with 19 columns, with data between t 1 and 852. (13110 units)
 .    
A dataset with 29 columns, with data between t 1 and 852. (13110 units)
 .    
A dataset with 24 columns, with data between t 1 and 852. (13110 units)
 .    
A dataset with 23 columns, with data between t 1 and 852. (13110 units)
 .    
A dataset with 30 columns, with data between t 1 and 852. (13110 units)
 .    
A dataset with 8 columns, with data between t 1 and 852. (13110 units)
 .    
A dataset with 11 columns, with data between t 1 and 852. (13110 units)


In [4]:
# Fetch all datasets for this development run. Later cells treat the result as
# a list that RetrieveFromList can search by dataset name (e.g. 'baseline').
Datasets = FetchData_pgm(dev_id)

Fetching data using querysets; returns as list of dictionaries containing datasets
 .    baseline: A dataset with 8 columns, with data between t = 1 and 852; 13110 units.
 .    conflictlong: A dataset with 19 columns, with data between t = 1 and 852; 13110 units.
 .    escwa_drought: A dataset with 29 columns, with data between t = 1 and 852; 13110 units.
 .    natsoc: A dataset with 24 columns, with data between t = 1 and 852; 13110 units.
 .    broad: A dataset with 23 columns, with data between t = 1 and 852; 13110 units.
 .    paola_conf_hist: A dataset with 30 columns, with data between t = 1 and 852; 13110 units.
 .    conf_treelag: A dataset with 8 columns, with data between t = 1 and 852; 13110 units.
 .    conf_sptime_dist: A dataset with 11 columns, with data between t = 1 and 852; 13110 units.


In [5]:
# Run the FetchData integrity check on each fetched dataset against the
# dependent variable. Judging by the messages printed below, the check may
# reorder columns in some datasets.
for ds in Datasets:
    FetchData.data_integrity_check(ds, depvar)

Reordering columns in model escwa_drought
Reordering columns in model broad
Reordering columns in model conf_treelag
Reordering columns in model conf_sptime_dist


## Define regressors

In [6]:
# Hyper-parameters shared by all estimators defined in this cell.
n_estimators = 200   # number of trees / boosting rounds
nj = 12              # value passed as n_jobs where it is set

# Keyword arguments handed to HurdleRegression as clf_params / reg_params.
# The XGBoost variants also set n_jobs; the LightGBM variants only set n_estimators.
clf_params = {'n_estimators': n_estimators, 'n_jobs': nj}
reg_params = {'n_estimators': n_estimators, 'n_jobs': nj}
clf_lgbm_params = {'n_estimators': n_estimators}
reg_lgbm_params = {'n_estimators': n_estimators}

# Plain regressors. No random_state is set on any of them.
rf_regressor = RandomForestRegressor(n_estimators=n_estimators, n_jobs=nj)
xgb_regressor = XGBRegressor(n_estimators=n_estimators, tree_method='hist', n_jobs=nj)
lgbm_regressor = LGBMRegressor(n_estimators=n_estimators)

# Hurdle regressors, each configured with a classifier name and a regressor name.
hur_regressor = HurdleRegression(
    clf_name='XGBClassifier',
    reg_name='XGBRegressor',
    clf_params=clf_params,
    reg_params=reg_params,
)
hur_lgbm_regressor = HurdleRegression(
    clf_name='LGBMClassifier',
    reg_name='LGBMRegressor',
    clf_params=clf_lgbm_params,
    reg_params=reg_lgbm_params,
)

## Specify models

In [7]:
def _model_entry(modelname, algorithm, queryset, data_train):
    """Build the metadata dictionary describing one constituent model.

    Keys are inserted in the order modelname, algorithm, depvar, queryset,
    data_train; the DataFrame built from ModelList later takes its column
    order from this.
    """
    return {
        'modelname': modelname,
        'algorithm': algorithm,
        'depvar': depvar,
        'queryset': queryset,
        'data_train': data_train,
    }


# One entry per model: (modelname, algorithm, queryset, data_train).
# Several models share the same estimator instance and the same dataset.
ModelList = [
    _model_entry('fat_jed_hh_baseline_lgbm', lgbm_regressor,
                 'hh_fat_pgm_baseline', 'baseline'),
    _model_entry('fat_jed_hh_conflictlong_lgbm', lgbm_regressor,
                 'hh_fat_pgm_conflictlong', 'conflictlong'),
    _model_entry('fat_jed_hh_conflictlong_hurdle_lgbm', hur_lgbm_regressor,
                 'hh_fat_pgm_conflictlong', 'conflictlong'),
    _model_entry('fat_jed_hh_drought_hurdle_lgbm', hur_lgbm_regressor,
                 'fat_escwa_drought_vulnerability_pgm', 'escwa_drought'),
    _model_entry('fat_jed_hh_drought_lgbm', lgbm_regressor,
                 'fat_escwa_drought_vulnerability_pgm', 'escwa_drought'),
    _model_entry('fat_jed_hh_natsoc_hurdle_lgbm', hur_lgbm_regressor,
                 'hh_fat_pgm_natsoc', 'natsoc'),
    _model_entry('fat_jed_hh_natsoc_lgbm', lgbm_regressor,
                 'hh_fat_pgm_natsoc', 'natsoc'),
    _model_entry('fat_jed_hh_broad_hurdle_lgbm', hur_lgbm_regressor,
                 'hh_fat_pgm_broad', 'broad'),
    _model_entry('fat_jed_hh_broad_lgbm', lgbm_regressor,
                 'hh_fat_pgm_broad', 'broad'),
    _model_entry('fat_jed_pv_conf_hist_pgm', xgb_regressor,
                 'paola_fatalities_conflict_history', 'paola_conf_hist'),
    _model_entry('fat_tree_lags_d_1_d_2_hur_36', hur_regressor,
                 'jim_pgm_conflict_treelag_d_1_d_2', 'conf_treelag'),
    _model_entry('fat_sptime_dist_nu1_10_001_hur_36', hur_regressor,
                 'jim_pgm_conflict_target_sptime_dist_nu1_10_001', 'conf_sptime_dist'),
]

# Keep `model` bound to the last specification, as it was after the final append.
model = ModelList[-1]

In [8]:
# Loop that checks whether the model exists, retrains if not,
# and stores the predictions if they have not been stored before for this run.
# To do: set the data_preprocessing to the function in the model dictionary

# Imported explicitly: datetime.now() is used for the progress timestamps below
# and would otherwise only resolve if one of the notebook's wildcard imports
# happened to export `datetime`.
from datetime import datetime

from views_runs import Storage, StepshiftedModels
from views_partitioning.data_partitioner import DataPartitioner
from viewser import Queryset, Column
from views_runs import operations
from views_runs.run_result import RunResult

level = 'pgm'
includeFuture = True
force_retrain = False   # passed as `retrain` to RunResult.retrain_or_retrieve


def _retrain_or_retrieve(model, partition_name, partitioner_dict, storage_suffix, retrain=False):
    """Return the RunResult for one model on one partition.

    Args:
        model: model specification dictionary from ModelList; the keys
            'algorithm', 'depvar', 'data_train', 'queryset' and 'modelname'
            are read.
        partition_name: key under which `partitioner_dict` is registered in
            the DataPartitioner, also passed as `partition_name`.
        partitioner_dict: dictionary with the "train" and "predict" spans.
        storage_suffix: appended to the model name to form the storage name.
        retrain: forwarded as `retrain` to RunResult.retrain_or_retrieve.

    Returns:
        Whatever RunResult.retrain_or_retrieve returns.
    """
    return RunResult.retrain_or_retrieve(
            retrain            = retrain,
            store              = storage.Storage(),
            partitioner        = DataPartitioner({partition_name: partitioner_dict}),
            stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
            dataset            = RetrieveFromList(Datasets, model['data_train']),
            queryset_name      = model['queryset'],
            partition_name     = partition_name,
            timespan_name      = "train",
            storage_name       = model['modelname'] + storage_suffix,
            author_name        = "JED",
    )


def _read_or_predict(store_name, make_predictions):
    """Return the predictions stored as `store_name` for `run_id`, creating them if absent.

    Args:
        store_name: name of the prediction set in the forecasts store.
        make_predictions: zero-argument callable producing the predictions;
            it is only called when reading from the store raises KeyError.

    Returns:
        The stored predictions, or the newly generated ones after they have
        been tagged with `run_id` and written to the store.
    """
    try:
        return pd.DataFrame.forecasts.read_store(run=run_id, name=store_name)
    except KeyError:
        print(store_name, ', run', run_id, 'does not exist, predicting')
        predictions = make_predictions()
        predictions.forecasts.set_run(run_id)
        predictions.forecasts.to_store(name=store_name)
        return predictions


for i, model in enumerate(ModelList):
    print(i, model['modelname'])

    # --- Calibration partition ---
    print('Calibration partition', datetime.now())
    model['Algorithm_text'] = str(model['algorithm'])
    model['RunResult_calib'] = _retrain_or_retrieve(
        model, "calib", calib_partitioner_dict, '_calib', retrain=force_retrain)
    model['predstore_calib'] = level + '_' + model['modelname'] + '_calib'
    print('Trying to retrieve predictions', datetime.now())
    predictions_calib = _read_or_predict(
        model['predstore_calib'],
        lambda: model['RunResult_calib'].run.predict(
            "calib", "predict", model['RunResult_calib'].data),
    )

    # --- Test partition ---
    print('Test partition', datetime.now())
    model['RunResult_test'] = _retrain_or_retrieve(
        model, "test", test_partitioner_dict, '_test', retrain=force_retrain)
    print('Trying to retrieve predictions', datetime.now())
    model['predstore_test'] = level + '_' + model['modelname'] + '_test'
    predictions_test = _read_or_predict(
        model['predstore_test'],
        lambda: model['RunResult_test'].run.predict(
            "test", "predict", model['RunResult_test'].data),
    )

    # --- Predictions for true future ---
    if includeFuture:
        print('Future', datetime.now())
        # NOTE(review): the future partition is registered under the key "test"
        # (not "future"). This is kept as in the original because the name is
        # passed on to the model storage; changing it presumably affects whether
        # previously stored models are found, so confirm before renaming.
        model['RunResult_future'] = _retrain_or_retrieve(
            model, "test", future_partitioner_dict, '_future', retrain=force_retrain)
        print('Trying to retrieve predictions', datetime.now())
        model['predstore_future'] = level + '_' + model['modelname'] + '_f' + str(FutureStart)
        predictions_future = _read_or_predict(
            model['predstore_future'],
            lambda: model['RunResult_future'].run.future_point_predict(
                FutureStart, model['RunResult_future'].data),
        )
#    model['algorithm'] = []

print('All done')

0 fat_jed_hh_baseline_lgbm
Calibration partition 2022-05-29 15:35:11.299896
 * == Performing a run: "fat_jed_hh_baseline_lgbm_calib" == * 
Model object named "fat_jed_hh_baseline_lgbm_calib" with equivalent metadata already exists.
Fetching "fat_jed_hh_baseline_lgbm_calib" from storage
Trying to retrieve predictions 2022-05-29 15:35:13.098795
pr_45_pgm_fat_jed_hh_baseline_lgbm_calib.parquet
Test partition 2022-05-29 15:35:20.130285
 * == Performing a run: "fat_jed_hh_baseline_lgbm_test" == * 
Model object named "fat_jed_hh_baseline_lgbm_test" with equivalent metadata already exists.
Fetching "fat_jed_hh_baseline_lgbm_test" from storage
Trying to retrieve predictions 2022-05-29 15:35:22.110882
pr_45_pgm_fat_jed_hh_baseline_lgbm_test.parquet
Future 2022-05-29 15:35:28.832478
 * == Performing a run: "fat_jed_hh_baseline_lgbm_future" == * 
Model object named "fat_jed_hh_baseline_lgbm_future" with equivalent metadata already exists.
Fetching "fat_jed_hh_baseline_lgbm_future" from storage
Tr

Model object named "fat_jed_hh_broad_hurdle_lgbm_calib" with equivalent metadata already exists.
Fetching "fat_jed_hh_broad_hurdle_lgbm_calib" from storage
Trying to retrieve predictions 2022-05-29 15:38:49.644375
pr_45_pgm_fat_jed_hh_broad_hurdle_lgbm_calib.parquet
Test partition 2022-05-29 15:39:16.426601
 * == Performing a run: "fat_jed_hh_broad_hurdle_lgbm_test" == * 
Model object named "fat_jed_hh_broad_hurdle_lgbm_test" with equivalent metadata already exists.
Fetching "fat_jed_hh_broad_hurdle_lgbm_test" from storage
Trying to retrieve predictions 2022-05-29 15:39:20.740431
pr_45_pgm_fat_jed_hh_broad_hurdle_lgbm_test.parquet
Future 2022-05-29 15:39:45.587978
 * == Performing a run: "fat_jed_hh_broad_hurdle_lgbm_future" == * 
Model object named "fat_jed_hh_broad_hurdle_lgbm_future" with equivalent metadata already exists.
Fetching "fat_jed_hh_broad_hurdle_lgbm_future" from storage
Trying to retrieve predictions 2022-05-29 15:39:49.730025
pr_45_pgm_fat_jed_hh_broad_hurdle_lgbm_f506

In [9]:
# Tabulate the model specifications: one row per entry in ModelList, one column
# per dictionary key (including the keys added by the training loop).
ModelMetaData = pd.DataFrame(ModelList)

In [10]:
# Display the model metadata table.
ModelMetaData

Unnamed: 0,modelname,algorithm,depvar,queryset,data_train,Algorithm_text,RunResult_calib,predstore_calib,RunResult_test,predstore_test,RunResult_future,predstore_future
0,fat_jed_hh_baseline_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,hh_fat_pgm_baseline,baseline,LGBMRegressor(n_estimators=200),RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_baseline_lgbm_calib,RunResult(training_date = 2022-05-16),pgm_fat_jed_hh_baseline_lgbm_test,RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_baseline_lgbm_f506
1,fat_jed_hh_conflictlong_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,hh_fat_pgm_conflictlong,conflictlong,LGBMRegressor(n_estimators=200),RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_conflictlong_lgbm_calib,RunResult(training_date = 2022-05-17),pgm_fat_jed_hh_conflictlong_lgbm_test,RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_conflictlong_lgbm_f506
2,fat_jed_hh_conflictlong_hurdle_lgbm,"HurdleRegression(clf_name='LGBMClassifier', cl...",ln_ged_sb_dep,hh_fat_pgm_conflictlong,conflictlong,"HurdleRegression(clf_name='LGBMClassifier', cl...",RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_conflictlong_hurdle_lgbm_calib,RunResult(training_date = 2022-05-17),pgm_fat_jed_hh_conflictlong_hurdle_lgbm_test,RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_conflictlong_hurdle_lgbm_f506
3,fat_jed_hh_drought_hurdle_lgbm,"HurdleRegression(clf_name='LGBMClassifier', cl...",ln_ged_sb_dep,fat_escwa_drought_vulnerability_pgm,escwa_drought,"HurdleRegression(clf_name='LGBMClassifier', cl...",RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_drought_hurdle_lgbm_calib,RunResult(training_date = 2022-05-17),pgm_fat_jed_hh_drought_hurdle_lgbm_test,RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_drought_hurdle_lgbm_f506
4,fat_jed_hh_drought_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,fat_escwa_drought_vulnerability_pgm,escwa_drought,LGBMRegressor(n_estimators=200),RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_drought_lgbm_calib,RunResult(training_date = 2022-05-18),pgm_fat_jed_hh_drought_lgbm_test,RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_drought_lgbm_f506
5,fat_jed_hh_natsoc_hurdle_lgbm,"HurdleRegression(clf_name='LGBMClassifier', cl...",ln_ged_sb_dep,hh_fat_pgm_natsoc,natsoc,"HurdleRegression(clf_name='LGBMClassifier', cl...",RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_natsoc_hurdle_lgbm_calib,RunResult(training_date = 2022-05-18),pgm_fat_jed_hh_natsoc_hurdle_lgbm_test,RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_natsoc_hurdle_lgbm_f506
6,fat_jed_hh_natsoc_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,hh_fat_pgm_natsoc,natsoc,LGBMRegressor(n_estimators=200),RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_natsoc_lgbm_calib,RunResult(training_date = 2022-05-18),pgm_fat_jed_hh_natsoc_lgbm_test,RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_natsoc_lgbm_f506
7,fat_jed_hh_broad_hurdle_lgbm,"HurdleRegression(clf_name='LGBMClassifier', cl...",ln_ged_sb_dep,hh_fat_pgm_broad,broad,"HurdleRegression(clf_name='LGBMClassifier', cl...",RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_broad_hurdle_lgbm_calib,RunResult(training_date = 2022-05-18),pgm_fat_jed_hh_broad_hurdle_lgbm_test,RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_broad_hurdle_lgbm_f506
8,fat_jed_hh_broad_lgbm,LGBMRegressor(n_estimators=200),ln_ged_sb_dep,hh_fat_pgm_broad,broad,LGBMRegressor(n_estimators=200),RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_broad_lgbm_calib,RunResult(training_date = 2022-05-18),pgm_fat_jed_hh_broad_lgbm_test,RunResult(training_date = 2022-02-23),pgm_fat_jed_hh_broad_lgbm_f506
9,fat_jed_pv_conf_hist_pgm,"XGBRegressor(base_score=None, booster=None, co...",ln_ged_sb_dep,paola_fatalities_conflict_history,paola_conf_hist,"XGBRegressor(base_score=None, booster=None, co...",RunResult(training_date = 2022-03-18),pgm_fat_jed_pv_conf_hist_pgm_calib,RunResult(training_date = 2022-05-18),pgm_fat_jed_pv_conf_hist_pgm_test,RunResult(training_date = 2022-03-18),pgm_fat_jed_pv_conf_hist_pgm_f506


In [11]:
# Export the full model metadata table to CSV under two names in the
# current directory.
localpath = './'
ModelList_df = pd.DataFrame.from_dict(ModelList)

# NOTE(review): `model` is whatever the training loop left bound, i.e. the last
# entry of ModelList, so this file is named after a single model although it
# contains every model. Confirm that this naming is intended.
filename = f"{localpath}Model_pgm_{model['modelname']}_{dev_id}.csv"
ModelList_df.to_csv(filename)

# Second copy, written without the localpath prefix.
gitname = f"ModelList_pgm_wide_{dev_id}.csv"
ModelList_df.to_csv(gitname)