
# ViEWS 3 constituent models 
## ViEWS production system, cm level


This notebook trains a set of regression models for use in the monthly updated ViEWS fatalities-prediction ensemble.

The notebook does the following: 
1. Retrieves data through querysets and stores it in `Datasets`, a list of dictionaries
2. Specifies the metadata of a number of models and stores it in `ModelList`, a list of dictionaries
3. Trains the models in `ModelList` and stores the trained objects in model storage and the predictions in prediction storage
4. Saves part of `ModelList` as csv and the rest as pickles

## Importing modules

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to the local
# helper modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:

# Basics
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier

from lightgbm import LGBMClassifier, LGBMRegressor

# Views 3
from viewser.operations import fetch
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
from views_runs import storage
from views_runs.storage import store, retrieve, fetch_metadata

from views_forecasts.extensions import *

# Other packages
import pickle as pkl

# Packages from viewsforecasting repository

#from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated
import os
import sys
sys.path.append('../')
sys.path.append('../Tools')
sys.path.append('../Intermediates')
from FetchData import FetchData, RetrieveFromList, document_queryset, ReturnQsList, document_ensemble
from ViewsEstimators import *


In [None]:
# Record the scikit-learn version used for this run in the notebook output.
import sklearn
print(sklearn.__version__)

## Common parameters

In [None]:
# Common parameters:
# Run identifiers, forecast horizon, train/predict partitions and local paths
# shared by the cells below.
import getpass  # local import: only needed to build the Dropbox path at the end of this cell

dev_id = 'Fatalities002'
run_id = dev_id

# Generating a new run if necessary

#try:
#    ViewsMetadata().new_run(name=run_id,description='Developing the fatalities model for FCDO',min_month=1,max_month=999)
#except KeyError:
#    if 'devel' not in run_id:
#        warnings.warn('You are overwriting a production system')

RerunQuerysets = True

EndOfHistory = 540 # Please NOTE: Changed to last month of GED data, 2024-12 (540). Used to be last month of GED +1, unsure why. 
steps = list(range(1, 36 + 1)) # Which steps to train and predict for (1..36 inclusive)
fi_steps = [1,3,6,12,36] # Which steps to present feature importances for
#steps = [1,3,6,12,36]
#fi_steps = [1,3,6,12,36]

# Specifying partitions - new for 2025
# Each partition maps a timespan name to a (first month_id, last month_id) tuple.
calib_partitioner_dict = {"train":(121,432),"predict":(433,480)}
test_partitioner_dict = {"train":(121,480),"predict":(481,528)}
future_partitioner_dict = {"train":(121,528),"predict":(529,540)}

calib_partitioner =  views_runs.DataPartitioner({"calib":calib_partitioner_dict})
test_partitioner =  views_runs.DataPartitioner({"test":test_partitioner_dict})
future_partitioner =  views_runs.DataPartitioner({"future":future_partitioner_dict})

# getpass.getuser() is used instead of os.getlogin(): the latter raises OSError
# when the kernel has no controlling terminal (e.g. JupyterHub, VS Code, nbconvert).
Mydropbox = f'/Users/{getpass.getuser()}/Dropbox (ViEWS)/ViEWS'
print('Setting Mydropbox to',Mydropbox)

# Retrieve data

In [None]:
# Sanity check of the environment: print the path of the `python` executable the shell resolves to.
! which python

In [None]:
# Create Markdown documentation of all querysets used
# `level` selects the country-month ('cm') querysets; `qslist` is reused by the
# data-fetching cell below.
level = 'cm'
qslist = ReturnQsList(level)
document_queryset(qslist,dev_id)

In [None]:
# Fetch the data for every queryset in qslist, passing EndOfHistory as the
# second argument. Later cells read the result as a list of dictionaries with
# (at least) the keys 'Name' and 'df'.
# NOTE(review): RerunQuerysets is set in the parameter cell but is not passed
# here - confirm whether it is still meant to control re-fetching.
from FetchData import fetch_cm_data_from_model_def

Datasets=fetch_cm_data_from_model_def(qslist, EndOfHistory)

# Generating predictions
This section uses the ViEWS3 partitioning/stepshifting syntax. Models are trained for A: the calibration partition and B: the test partition, to test out some calibration routines. Most models are trained with ln_ged_sb_best as outcome.

In [None]:
# Show the column names of every dataset whose name mentions 'topics'.
for dataset in Datasets:
    if 'topics' not in dataset['Name']:
        continue
    print(dataset['df'].columns)
    

## Checking missingness and infinity values

In [None]:
# For each dataset, report the number of rows, missing values and infinite
# values in each of the first N columns.
N = 51  # number of leading columns to inspect per dataset
for dataset in Datasets:
    df = dataset['df']
    print(dataset['Name'])
    for col in df.iloc[:, :N].columns:
        series = df[col]
        # Count infinities in this column only. The previous version summed
        # np.isinf over the whole frame and so printed the same total on every
        # line. Non-numeric columns cannot hold inf and are reported as 0.
        if pd.api.types.is_numeric_dtype(series):
            n_inf = np.isinf(series).sum()
        else:
            n_inf = 0
        print(col, len(series), 'missing:', series.isnull().sum(), 'infinity:', n_inf)


# Specify models in ensemble

In [None]:
from ModelDefinitions import DefineEnsembleModels

# Model specifications for the country-month ('cm') ensemble.
ModelList = DefineEnsembleModels('cm')

# Overview: list index, model name and the dataset each model is trained on.
for position in range(len(ModelList)):
    spec = ModelList[position]
    print(position, spec['modelname'], spec['data_train'])

In [None]:
# Display the full model specifications for inspection.
ModelList

In [None]:
# Write documentation for the models in ModelList; 'sb' is passed as the second argument.
document_ensemble(ModelList,'sb')

In [None]:
# Print the per-column count of missing values for each dataset, then replace
# the dataset's frame with a copy in which every missing value is set to 0.
for dataset in Datasets:
    frame = dataset['df']
    missing_per_column = frame.isna().sum()
    print(dataset['Name'], missing_per_column)
    dataset['df'] = frame.fillna(0)

In [None]:
# Loop that checks whether the model exists, retrains if not, 
# and stores the predictions if they have not been stored before for this run.
#
# For every model in ModelList the loop handles the calibration partition, the
# test partition and (if includeFuture is set) the true future:
#   * Models whose name does not contain 'Markov' are trained/retrieved through
#     RunResult.retrain_or_retrieve and predicted with the resulting run object.
#   * Models whose name contains 'Markov' are computed by markov.compute_markov.
# Predictions are written to prediction storage under model['predstore_calib'],
# model['predstore_test'] and model['predstore_future'].
# model['predstore_calib'] and model['predstore_test'] are read here but not set
# in this cell; presumably they come from the model definitions - TODO confirm.
#
# To do: set the data_preprocessing to the function in the model dictionary

level = 'cm'
includeFuture = False  # also train and predict for the true future
force_rewrite = True   # stepshifted models: always re-predict and overwrite stored predictions
force_retrain = True   # passed as `retrain=`; for Markov models it also forces re-computation

from datetime import datetime  # needed for the progress timestamps printed below

from views_runs import Storage, StepshiftedModels
from views_partitioning.data_partitioner import DataPartitioner
from viewser import Queryset, Column
from views_runs import operations
from views_runs.run_result import RunResult
from new_markov import markov

for i, model in enumerate(ModelList[:]):
    if 'Markov' not in model['modelname']:
        
        modelstore = storage.Storage()
        ct = datetime.now()
        print(i, model['modelname'])
        print('Calibration partition', ct)
        model['Algorithm_text'] = str(model['algorithm'])
        model['RunResult_calib'] = RunResult.retrain_or_retrieve(
                retrain            = force_retrain,
                store              = modelstore,
                partitioner        = DataPartitioner({"calib":calib_partitioner_dict}),
                stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
                dataset            = RetrieveFromList(Datasets,model['data_train']),
                queryset_name      = model['queryset'],
                partition_name     = "calib",
                timespan_name      = "train",
                storage_name       = model['modelname'] + '_calib',
                author_name        = "JED",
        )

    #    model['predstore_calib'] = level +  '_' + model['modelname'] + '_calib'
        ct = datetime.now()
        if force_rewrite:
            print(model['predstore_calib'], ', run',  run_id, 'force_rewrite=True, predicting')
            predictions_calib = model['RunResult_calib'].run.predict("calib","predict", model['RunResult_calib'].data)
            predictions_calib.forecasts.set_run(run_id)
            predictions_calib.forecasts.to_store(name=model['predstore_calib'],overwrite=True)
        else:
            print('Trying to retrieve predictions', ct)
            try:
                predictions_calib = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstore_calib'])
            except KeyError:
                # Nothing stored under this name for this run: predict and store
                print(model['predstore_calib'], ', run',  run_id, 'does not exist, predicting')
                predictions_calib = model['RunResult_calib'].run.predict("calib","predict", model['RunResult_calib'].data)
                predictions_calib.forecasts.set_run(run_id)
                predictions_calib.forecasts.to_store(name=model['predstore_calib'])

        ct = datetime.now()
        print('Test partition', ct)
        modelstore = storage.Storage()
        model['RunResult_test'] = RunResult.retrain_or_retrieve(
                retrain            = force_retrain,
                store              = modelstore,
                partitioner        = DataPartitioner({"test":test_partitioner_dict}),
                stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
                dataset            = RetrieveFromList(Datasets,model['data_train']),
                queryset_name      = model['queryset'],
                partition_name     = "test",
                timespan_name      = "train",
                storage_name       = model['modelname'] + '_test',
                author_name        = "JED",
        )
        ct = datetime.now()
        
        if force_rewrite:
            print(model['predstore_test'], ', run',  run_id, 'force_rewrite=True, predicting')
            predictions_test = model['RunResult_test'].run.predict("test","predict", model['RunResult_test'].data)
            predictions_test.forecasts.set_run(run_id)
            predictions_test.forecasts.to_store(name=model['predstore_test'],overwrite=True)
        else:
            print('Trying to retrieve predictions', ct)
    #    model['predstore_test'] = level +  '_' + model['modelname'] + '_test'
            try:
                predictions_test = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstore_test'])
            except KeyError:
                print(model['predstore_test'], ', run', run_id, 'does not exist, predicting')
                predictions_test = model['RunResult_test'].run.predict("test","predict",model['RunResult_test'].data)
                predictions_test.forecasts.set_run(run_id)
                predictions_test.forecasts.to_store(name=model['predstore_test'])
        # Predictions for true future
        if includeFuture:
            ct = datetime.now()
            print('Future', ct)
            modelstore = storage.Storage()
            # The future partition is registered under the partition name "test"
            # (kept as in the original code).
            model['RunResult_future'] = RunResult.retrain_or_retrieve(
                    retrain            = force_retrain,
                    store              = modelstore,
                    partitioner        = DataPartitioner({"test":future_partitioner_dict}),
                    stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
                    dataset            = RetrieveFromList(Datasets,model['data_train']),
                    queryset_name      = model['queryset'],
                    partition_name     = "test",
                    timespan_name      = "train",
                    storage_name       = model['modelname'] + '_future',
                    author_name        = "JED",
            )
            ct = datetime.now()

            # Set the storage name before either branch reads it (it used to be
            # assigned only in the non-forced branch, after its first use).
            model['predstore_future'] = level +  '_' + model['modelname'] + '_f' + str(EndOfHistory)

            if force_rewrite:
                print(model['predstore_future'], ', run',  run_id, 'force_rewrite=True, predicting')
                # Point forecasts from EndOfHistory onwards. The previous call,
                # run.predict(EndOfHistory, data), passed two arguments where the
                # calib/test calls above pass (partition, timespan, data).
                # NOTE(review): confirm future_point_predict against the installed views_runs version.
                predictions_future = model['RunResult_future'].run.future_point_predict(EndOfHistory, model['RunResult_future'].data)
                predictions_future.forecasts.set_run(run_id)
                predictions_future.forecasts.to_store(name=model['predstore_future'],overwrite=True)
            else:
                print('Trying to retrieve predictions', ct)
                # Retrieve stored predictions, or predict and store if missing
                # (previously this branch stored predictions it never computed).
                try:
                    predictions_future = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstore_future'])
                except KeyError:
                    print(model['predstore_future'], ', run', run_id, 'does not exist, predicting')
                    predictions_future = model['RunResult_future'].run.future_point_predict(EndOfHistory, model['RunResult_future'].data)
                    predictions_future.forecasts.set_run(run_id)
                    predictions_future.forecasts.to_store(name=model['predstore_future'])
                
    else:
        # Markov models: computed by markov.compute_markov instead of a stepshifted run
        modelstore = storage.Storage()
        ct = datetime.now()
        print(i, model['modelname'])
        print('Calibration partition', ct)
        model['Algorithm_text'] = str(model['algorithm'])
        print('Trying to retrieve predictions', ct)
        if force_retrain:
            print(model['predstore_calib'], ', run',  run_id, 'force_retrain = True, predicting')
            predictions_calib = markov.compute_markov(calib_partitioner_dict, EndOfHistory, model['depvar'], 'calib', model['algorithm'])
            predictions_calib.forecasts.set_run(run_id)
            predictions_calib.forecasts.to_store(name=model['predstore_calib'],overwrite=True)
        else:
            try:
                predictions_calib = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstore_calib'])
            except KeyError:
                print(model['predstore_calib'], ', run',  run_id, 'does not exist, predicting')
                predictions_calib = markov.compute_markov(calib_partitioner_dict, EndOfHistory, model['depvar'], 'calib', model['algorithm'])
                predictions_calib.forecasts.set_run(run_id)
                predictions_calib.forecasts.to_store(name=model['predstore_calib'],overwrite=True)
                
        ct = datetime.now()
        print('Test partition', ct)
        modelstore = storage.Storage()
        if force_retrain:
            print(model['predstore_test'], ', run', run_id, 'force_retrain=True, predicting')
            predictions_test = markov.compute_markov(test_partitioner_dict, EndOfHistory, model['depvar'], 'test', model['algorithm'])
            predictions_test.forecasts.set_run(run_id)
            predictions_test.forecasts.to_store(name=model['predstore_test'],overwrite=True)
        else:
            try:
                predictions_test = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstore_test'])
            except KeyError:
                print(model['predstore_test'], ', run', run_id, 'does not exist, predicting')
                predictions_test = markov.compute_markov(test_partitioner_dict, EndOfHistory, model['depvar'], 'test', model['algorithm'])
                predictions_test.forecasts.set_run(run_id)
                predictions_test.forecasts.to_store(name=model['predstore_test'],overwrite=True)
                
        if includeFuture:
            ct = datetime.now()
            print('Future', ct)
            modelstore = storage.Storage()
            print('Trying to retrieve predictions', ct)
            model['predstore_future'] = level +  '_' + model['modelname'] + '_f' + str(EndOfHistory)
            # NOTE(review): the future predictions are computed with
            # test_partitioner_dict, not future_partitioner_dict - confirm that
            # this is what compute_markov expects for the 'future' case.
            if force_retrain:
                print(model['predstore_future'], ', run', run_id, 'force_retrain=True, predicting')
                predictions_future = markov.compute_markov(test_partitioner_dict, EndOfHistory, model['depvar'], 'future', model['algorithm'])
                predictions_future.forecasts.set_run(run_id)
                predictions_future.forecasts.to_store(name=model['predstore_future'],overwrite=True)
            else:
                try:
                    predictions_future = pd.DataFrame.forecasts.read_store(run=run_id, name=model['predstore_future'])
                except KeyError:
                    print(model['predstore_future'], ', run', run_id, 'does not exist, predicting')
                    predictions_future = markov.compute_markov(test_partitioner_dict, EndOfHistory, model['depvar'], 'future', model['algorithm'])
                    predictions_future.forecasts.set_run(run_id)
                    predictions_future.forecasts.to_store(name=model['predstore_future'],overwrite=True)  

    # Separator after every model (it used to be printed for Markov models only)
    print('**************************************************************')

print('All done')

## Notes on training time for the various algorithms:

In [None]:
#These are calculated in minutes for the hh20 feature set (with about 40 features), for all 36 steps, calibration (c) and test (t) partitions, also include generating predictions, and are approximate:

#nj=12 (number of threads)
#scikit random forest:        21:13 (c), 26:20 (t) RandomForestRegressor(n_estimators=200, n_jobs=nj)
#XGB random forest:           06:02 (c), 07:51 (t) XGBRFRegressor(n_estimators=300,n_jobs=nj)
#scikit gbm:                  13:59 (c), 15:55 (t) GradientBoostingRegressor(), 
#scikit hurdle random forest: 07:32 (c), 09:49 (t) For both clf and reg: (n_estimators=200, n_jobs=nj)
#XGB hurdle xgb:              01:26 (c), 01:32 (t) For both clf and reg:                n_estimators=200,tree_method='hist',n_jobs=nj)
#scikit histgbm:              01:17 (c), 01:20 (t) HistGradientBoostingRegressor(max_iter=200)
#XGB xgb:                     01:00 (c), 01:04 (t) XGBRegressor(n_estimators=200,tree_method='hist',n_jobs=nj)
#lightgbm gbm:                00:25 (c), --    (t) LGBMRegressor(n_estimators=100,num_threads=8)

# Various helper functions and tools....

# Retrieving external forecasts

In [None]:
## Retrieve David's Markov models
## To do: rewrite the model dictionary to the new, slimmer version.
#DRList = []


#model = {
#    'modelname':   'fatalities002_Markov_glm',
#    'algorithm': [],
#    'depvar': "ln_ged_sb_dep",
#    'data_train':      'joint_narrow',
#    'queryset': 'fatalities002_joint_narrow',
#}
#DRList.append(model)

#model = {
#    'modelname':   'fatalities002_Markov_rf',
#    'algorithm': [],
#    'depvar': "ln_ged_sb_dep",
#    'data_train':      'joint_narrow',
#    'queryset': 'fatalities002_joint_narrow',
#}

#DRList.append(model)



In [None]:
#path = f'/Users/{os.getlogin()}/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/'

#DRList[0]['predictions_file_calib'] = path + 'markov_jointnarrow_ss_glm_calib.parquet'
#DRList[0]['predictions_file_test'] = path + 'markov_jointnarrow_ss_glm_test.parquet'
#DRList[0]['predictions_file_future'] = path + 'vmm_glm_hh20_517.csv'

#DRList[1]['predictions_file_calib'] = path + 'markov_jointnarrow_ss_rf_calib.parquet'
#DRList[1]['predictions_file_test'] = path + 'markov_jointnarrow_ss_rf_test.parquet'
#DRList[1]['predictions_file_future'] = path + 'vmm_rf_hh20_517.csv'

In [None]:
## Storing Markov models in central storage
## Retrieving dependent variable

#print('Adding depvar - CHECK FILES BEING USED FROM STORAGE ARE SUITABLE!')
#target_calib = pd.DataFrame.forecasts.read_store('cm_fatalities002_conflicthistory_rf_calib', run=run_id)['ln_ged_sb_dep']
#target_test = pd.DataFrame.forecasts.read_store('cm_fatalities002_conflicthistory_rf_test', run=run_id)['ln_ged_sb_dep']
#level = 'cm'
#for model in DRList:
#    df_calib = pd.read_parquet(model['predictions_file_calib'])
##    df_calib.rename(columns={'target_month_id':'month_id'}, inplace=True)
##    df_calib.set_index(['month_id', 'country_id'], inplace=True)

#    df_test = pd.read_parquet(model['predictions_file_test'])
##    df_test.rename(columns={'target_month_id':'month_id'}, inplace=True)
##    df_calib.set_index(['month_id', 'country_id'], inplace=True)

##    df_future = pd.read_csv(model['predictions_file_future'],index_col=['month_id','country_id'])
#    df_calib['ln_ged_sb_dep'] = target_calib
#    df_test['ln_ged_sb_dep'] = target_test
##    df_future['ln_ged_sb_dep'] = np.nan # Empty dependent variable column for consistency/required by prediction storage function
#    stored_modelname = level + '_' + model['modelname'] + '_calib'
#    df_calib.forecasts.set_run(run_id)
#    df_calib.forecasts.to_store(name=stored_modelname, overwrite=True)
#    stored_modelname = level + '_' + model['modelname'] + '_test'
#    df_test.forecasts.set_run(run_id)
#    df_test.forecasts.to_store(name=stored_modelname, overwrite=True)    

In [None]:
# Marker showing that the notebook ran to the end.
print('All done!')