
# ViEWS 3 constituent models 
## ViEWS production system, cm level


This notebook trains a set of regression models for use in the monthly updated ViEWS predicting fatalities ensemble

The notebook does the following: 
1. Retrieves data through querysets and stores it in `Datasets`, a list of dictionaries
2. Specifies the metadata of a number of models, stores in ModelList, a list of dictionaries
3. Trains the models in ModelList, stores the trained objects in model storage and prediction storage
4. Saves part of ModelList as csv and the rest as pickles


## Importing modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier

from lightgbm import LGBMClassifier, LGBMRegressor

# Views 3
from viewser.operations import fetch
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
#import views_dataviz
from views_runs import storage
from views_runs.storage import store, retrieve, fetch_metadata

from views_forecasts.extensions import *

# Other packages
import pickle as pkl

# Packages from viewsforecasting repository

#from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated
import os
import sys
sys.path.append('../')
sys.path.append('../Tools')
sys.path.append('../Intermediates')
from FetchData import FetchData, RetrieveFromList, document_queryset, ReturnQsList, document_ensemble
from ViewsEstimators import *


The baseline queryset is pushed forward from fatalities002, in a new, updated version.
 The topics features have been changed slightly to reflect the new time lag. Note that in the future we should be able to tlag by 1 only since, for example, by September 7th we should have all data for October, which is technically ahead of the UCDP update schedule. For now, though, we have data through July, so given that we are running the model in October I lagged the values by 3 instead of 1. I converted the 1-year lag to 15 months, and then replaced the running average with a 3-month lag with stock over 12. Note that this running average is not the exact version calculated by the Rauh and Muller team but a stand-in. Additional variables will be added.
 Google Trends has similarly been lagged to reflect that the last available data is from August. There might be something wrong with my querysets, as the NAs appear to be autofilled with 0s when querying. I am therefore lagging the google_index value by tlag 2.
 Internet usage is available at count

## Common parameters

In [None]:
#!conda list | grep views

In [None]:
# To do:
# find out why and where missingness occurs

In [3]:
# Common parameters used by the cells below.
# dev_id names this development run; run_id is bound to the same string and is
# the run under which predictions are read from / written to prediction storage.
run_id = dev_id = 'fat_dev_mc_media'

# Generating a new run if necessary

#try:
 #  ViewsMetadata().new_run(name=run_id,description='Developing the fatalities model for FCDO',min_month=1,max_month=999)
#except KeyError:
  #if 'devel' not in run_id:
     #   warnings.warn('You are overwriting a production system')

# NOTE(review): RerunQuerysets is not consulted by any cell visible in this notebook — confirm it is still needed.
RerunQuerysets = True

# month_id handed to future_point_predict and embedded in the future prediction store name
FutureStart = 515
# Which steps to train and predict for (1 through 36)
steps = list(range(1, 36 + 1))
# Which steps to present feature importances for
fi_steps = [1, 3, 6, 12, 36]
#steps = [1,3,6,12,36]
#fi_steps = [1,3,6,12,36]

# Specifying partitions: (first month_id, last month_id) for training and prediction
# NOTE(review): the future partition predicts months 493-504 while FutureStart is 515 — confirm these are meant to differ.
calib_partitioner_dict = dict(train=(121, 396), predict=(397, 444))
test_partitioner_dict = dict(train=(121, 444), predict=(445, 492))
future_partitioner_dict = dict(train=(121, 492), predict=(493, 504))

# NOTE(review): the training cell builds its own DataPartitioner objects from the
# dictionaries above; these three objects are not referenced in the visible cells.
calib_partitioner = views_runs.DataPartitioner({"calib": calib_partitioner_dict})
test_partitioner = views_runs.DataPartitioner({"test": test_partitioner_dict})
future_partitioner = views_runs.DataPartitioner({"future": future_partitioner_dict})

# Root of the shared Dropbox folder for the user currently logged in
Mydropbox = f'/Users/{os.getlogin()}/Dropbox (ViEWS)/ViEWS'
print('Setting Mydropbox to', Mydropbox)

Setting Mydropbox to /Users/malika/Dropbox (ViEWS)/ViEWS


# Retrieve data

In [4]:
# Create Markdown documentation of all querysets used
level = 'cm'  # level of analysis handed to ReturnQsList
qslist = ReturnQsList(level)  # queryset list for this level (helper imported from FetchData)
document_queryset(qslist,dev_id)  # writes the documentation, labelled with dev_id

 .      o   fat_dev_mc_media_baseline; A dataset with 6 columns, with data between t 1 and 852. (213 units)
 .      o   fat_dev_mc_media_topics_stub; A dataset with 60 columns, with data between t 1 and 852. (213 units)
 .      o   fat_dev_mc_media_google_internet_stub; A dataset with 13 columns, with data between t 1 and 852. (213 units)
 .     .    Model:  fat_dev_mc_media_baseline
Model:  fat_dev_mc_media_topics
Model:  fat_dev_mc_media_google_internet
Model:  fat_dev_mc_media_all_features


In [5]:
# Fetch the data for the querysets in qslist. Datasets is a list whose entries
# expose at least 'Name' and 'df' (the keys read by the cells further down).
# NOTE(review): RerunQuerysets from the parameters cell is not passed or checked here — confirm that is intended.
from FetchData import fetch_cm_data_from_model_def

Datasets=fetch_cm_data_from_model_def(qslist)

no. qss 4
defined ['fat_dev_mc_media_baseline', 'fat_dev_mc_media_topics', 'fat_dev_mc_media_google_internet', 'fat_dev_mc_media_all_features']
model ['fat_dev_mc_media_baseline', 'fat_dev_mc_media_topics', 'fat_dev_mc_media_all_features', 'fat_dev_mc_media_google_internet']
 .    media_baseline: A dataset with 6 columns, with data between t = 1 and 852; 213 units.
 .    media_topics: A dataset with 66 columns, with data between t = 1 and 852; 213 units.
 .    media_all_features: A dataset with 79 columns, with data between t = 1 and 852; 213 units.
 .    media_google_internet: A dataset with 19 columns, with data between t = 1 and 852; 213 units.
len datasets 4


# Generating predictions
Using the ViEWS3 partitioning/stepshifting syntax. Models are trained for A: the calibration partition and B: the test partition, to test out some calibration routines. Most models are trained with `ln_ged_sb_dep` as the outcome.

In [6]:
dev_id

'fat_dev_mc_media'

In [7]:
Datasets[0]['df'].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ln_ged_sb_dep,ln_ged_sb,wdi_sp_pop_totl,decay_ged_sb_5,decay_ged_os_5,splag_1_decay_ged_sb_5
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0.0,0.0,780153.0,0.0,0.0,0.0
1,2,0.0,0.0,359531.0,0.0,0.0,0.0
1,3,0.0,0.0,1084744.0,0.0,0.0,0.0
1,4,0.0,0.0,15182611.0,0.0,0.0,0.0
1,5,0.0,0.0,155525.0,0.0,0.0,0.0


In [8]:
from views_runs import ModelMetadata 
help(ModelMetadata)

Help on class ModelMetadata in module views_schema.models:

class ModelMetadata(pydantic.main.BaseModel)
 |  ModelMetadata(*, author: str, queryset_name: str, train_start: int, train_end: int, steps: Optional[List[int]] = None, training_date: datetime.datetime) -> None
 |  
 |  ModelMetadata
 |  
 |  Data used to organize model objects.
 |  
 |  parameters:
 |      author (str): Name of the user that authored the model object.
 |      queryset_name (str): Name of the queryset used to train the model
 |      train_start (int): Month identifier for training start date
 |      train_start (int): Month identifier for training end date
 |      training_date (datetime.datetime): Timestamp for training date (use datetime.datetime.now())
 |  
 |  example:
 |  
 |      # Instantiate the class with values
 |  
 |      my_metadata = ModelMetadata(
 |          author = "my_name",
 |          queryset_name = "my_queryset",
 |          train_start = 1,
 |          train_end = 300,
 |          steps 




## Checking missingness and infinity values

In [9]:
# For the first N columns of every dataset, print the column name, the number
# of rows, the number of missing values and the number of infinite values.
N=51  # upper bound on the number of leading columns reported per dataset
for dataset in Datasets:
    df = dataset['df']
    print(dataset['Name'])
    for col in df.iloc[: , :N].columns:
        column = df[col]
        # Count infinities in this column only: applying np.isinf to the whole
        # frame here would print the same frame-wide total on every line and
        # rescan the full frame once per column.
        print(col,len(column), 'missing:', column.isnull().sum(), 'infinity:', np.isinf(column).values.sum())


media_baseline
ln_ged_sb_dep 158230 missing: 0 infinity: 0
ln_ged_sb 158230 missing: 0 infinity: 0
wdi_sp_pop_totl 158230 missing: 11 infinity: 0
decay_ged_sb_5 158230 missing: 0 infinity: 0
decay_ged_os_5 158230 missing: 0 infinity: 0
splag_1_decay_ged_sb_5 158230 missing: 0 infinity: 0
media_topics
ln_ged_sb_dep 158230 missing: 0 infinity: 0
ln_ged_sb 158230 missing: 0 infinity: 0
wdi_sp_pop_totl 158230 missing: 11 infinity: 0
topic0_religion_t3 158230 missing: 5 infinity: 0
topic0_religion_t15 158230 missing: 11 infinity: 0
topic1_politics_t3 158230 missing: 5 infinity: 0
topic1_politics_t15 158230 missing: 11 infinity: 0
topic2_sanctions_3 158230 missing: 5 infinity: 0
topic2_sanctions_t15 158230 missing: 11 infinity: 0
topic3_life_t3 158230 missing: 5 infinity: 0
topic3_life_t15 158230 missing: 11 infinity: 0
topic4_energy_t3 158230 missing: 5 infinity: 0
topic4_energy_t15 158230 missing: 11 infinity: 0
topic5_media_t3 158230 missing: 5 infinity: 0
topic5_media_t15 158230 missing:

# Specify models in ensemble

In [10]:
# Build the list of model specification dictionaries for the 'cm' level
from ModelDefinitions_media import DefineEnsembleModels

ModelList = DefineEnsembleModels('cm')
    
# List each model's position, name and the name of the dataset it is trained on
for imodel,model in enumerate(ModelList):
    print(imodel, model['modelname'], model['data_train'])

0 fat_dev_mc_media_baseline_xgbrf media_baseline
1 fat_dev_mc_media_topics_xgbrf media_topics
2 fat_dev_mc_media_google_internet_hurdle media_google_internet
3 fat_dev_mc_media_all_features_xgbrf media_all_features


In [11]:
ModelList

[{'modelname': 'fat_dev_mc_media_baseline_xgbrf',
  'algorithm': XGBRFRegressor(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bytree=None,
                 early_stopping_rounds=None, enable_categorical=False,
                 eval_metric=None, gamma=None, gpu_id=None, grow_policy=None,
                 importance_type=None, interaction_constraints=None, max_bin=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, n_estimators=300, n_jobs=4,
                 num_parallel_tree=None, objective='reg:squarederror',
                 predictor=None, random_state=None, reg_alpha=None,
                 sampling_method=None, scale_pos_weight=None, ...),
  'depvar': 'ln_ged_sb_dep',
  'data_train': 'media_baseline',
  'queryset': 'fat_dev_mc_media_baseline',
  'preprocessing': 'float_it',
  'level': 'c

In [12]:
document_ensemble(ModelList,'sb')

0 fat_dev_mc_media_baseline_xgbrf media_baseline
1 fat_dev_mc_media_topics_xgbrf media_topics
2 fat_dev_mc_media_google_internet_hurdle media_google_internet
3 fat_dev_mc_media_all_features_xgbrf media_all_features


In [13]:
# Loop that checks whether the model exists, retrains if not,
# and stores the predictions if they have not been stored before for this run.
# To do: set the data_preprocessing to the function in the model dictionary

# Imported explicitly so this cell does not rely on a star import elsewhere in
# the notebook happening to expose `datetime`.
from datetime import datetime

level = 'cm'
includeFuture = False

from views_runs import Storage, StepshiftedModels
from views_partitioning.data_partitioner import DataPartitioner
from viewser import Queryset, Column
from views_runs import operations
from views_runs.run_result import RunResult


def _train_or_retrieve(model, store, partition_name, partitioner_dict, suffix, retrain):
    """Train (or retrieve) the stepshifted models of one model spec for one partition.

    Args:
        model: model specification dictionary from ModelList.
        store: model storage object handed to RunResult.retrain_or_retrieve.
        partition_name: key under which partitioner_dict is registered.
        partitioner_dict: {"train": (start, end), "predict": (start, end)}.
        suffix: appended to model['modelname'] to form the storage name.
        retrain: forwarded as the `retrain` flag.

    Returns:
        Whatever RunResult.retrain_or_retrieve returns for these settings.
    """
    return RunResult.retrain_or_retrieve(
            retrain            = retrain,
            store              = store,
            partitioner        = DataPartitioner({partition_name:partitioner_dict}),
            stepshifted_models = StepshiftedModels(model['algorithm'], steps, model['depvar']),
            dataset            = RetrieveFromList(Datasets,model['data_train']),
            queryset_name      = model['queryset'],
            partition_name     = partition_name,
            timespan_name      = "train",
            storage_name       = model['modelname'] + suffix,
            author_name        = "HH",
    )


def _read_or_predict(store_name, make_predictions):
    """Return the predictions stored as store_name for run_id, creating them if absent.

    Args:
        store_name: name of the entry in prediction storage.
        make_predictions: zero-argument callable producing the predictions;
            only called when reading from the store raises KeyError.

    Returns:
        The predictions, either read from the store or freshly generated
        (in which case they are also tagged with run_id and stored).
    """
    try:
        return pd.DataFrame.forecasts.read_store(run=run_id, name=store_name)
    except KeyError:
        print(store_name, ', run', run_id, 'does not exist, predicting')
        predictions = make_predictions()
        predictions.forecasts.set_run(run_id)
        predictions.forecasts.to_store(name=store_name)
        return predictions


for i, model in enumerate(ModelList):
    if model['algorithm'] != 'Rscript':
        # NOTE(review): models are always retrained, yet predictions already in
        # the store are reused rather than regenerated, so stored predictions
        # may stem from an earlier fit — confirm this is intended.
        force_retrain = True
        modelstore = storage.Storage()
        ct = datetime.now()
        print(i, model['modelname'])
        print('Calibration partition', ct)
        model['Algorithm_text'] = str(model['algorithm'])
        model['RunResult_calib'] = _train_or_retrieve(
            model, modelstore, "calib", calib_partitioner_dict, '_calib', force_retrain)

        # Keep a store name supplied by the model definition; otherwise fall
        # back to the conventional name instead of failing with a KeyError.
        model.setdefault('predstore_calib', level +  '_' + model['modelname'] + '_calib')
        ct = datetime.now()
        print('Trying to retrieve predictions', ct)
        predictions_calib = _read_or_predict(
            model['predstore_calib'],
            lambda: model['RunResult_calib'].run.predict("calib","predict", model['RunResult_calib'].data),
        )

        ct = datetime.now()
        print('Test partition', ct)
        modelstore = storage.Storage()
        model['RunResult_test'] = _train_or_retrieve(
            model, modelstore, "test", test_partitioner_dict, '_test', force_retrain)
        ct = datetime.now()
        print('Trying to retrieve predictions', ct)
        model.setdefault('predstore_test', level +  '_' + model['modelname'] + '_test')
        predictions_test = _read_or_predict(
            model['predstore_test'],
            lambda: model['RunResult_test'].run.predict("test","predict",model['RunResult_test'].data),
        )
        # Predictions for true future
        if includeFuture:
            ct = datetime.now()
            print('Future', ct)
            modelstore = storage.Storage()
            # The future partition is registered under the key "test", as before.
            model['RunResult_future'] = _train_or_retrieve(
                model, modelstore, "test", future_partitioner_dict, '_future', force_retrain)
            ct = datetime.now()
            print('Trying to retrieve predictions', ct)
            model['predstore_future'] = level +  '_' + model['modelname'] + '_f' + str(FutureStart)
            predictions_future = _read_or_predict(
                model['predstore_future'],
                lambda: model['RunResult_future'].run.future_point_predict(FutureStart,model['RunResult_future'].data),
            )
        print('**************************************************************')

print('All done')

0 fat_dev_mc_media_baseline_xgbrf
Calibration partition 2022-11-02 12:12:27.285979
 * == Performing a run: "fat_dev_mc_media_baseline_xgbrf_calib" == * 
Model object named "fat_dev_mc_media_baseline_xgbrf_calib" with equivalent metadata already exists.
Retrain is true, overwriting "fat_dev_mc_media_baseline_xgbrf_calib"
Training model(s)...
Storing "fat_dev_mc_media_baseline_xgbrf_calib"
Trying to retrieve predictions 2022-11-02 12:13:29.347753
pr_51_cm_fat_dev_mc_media_baseline_xgbrf_calib.parquet
Test partition 2022-11-02 12:13:31.604549
 * == Performing a run: "fat_dev_mc_media_baseline_xgbrf_test" == * 
Model object named "fat_dev_mc_media_baseline_xgbrf_test" with equivalent metadata already exists.
Retrain is true, overwriting "fat_dev_mc_media_baseline_xgbrf_test"
Training model(s)...
Storing "fat_dev_mc_media_baseline_xgbrf_test"
Trying to retrieve predictions 2022-11-02 12:14:47.316254
pr_51_cm_fat_dev_mc_media_baseline_xgbrf_test.parquet
**************************************

In [14]:
# Exploring the test-partition predictions left over from the last model handled
# by the training loop: take the cross-section where index level 1 equals 246
# (presumably country_id, going by the dataset index — confirm) and show the last rows.


predictions_test.xs(246,level=1).tail()

Unnamed: 0_level_0,ln_ged_sb_dep,ln_ged_sb,wdi_sp_pop_totl,_wdi_sp_pop_totl,topic0_religion_t3,topic0_religion_t15,topic1_politics_t3,topic1_politics_t15,topic2_sanctions_3,topic2_sanctions_t15,...,step_pred_33,step_pred_34,step_pred_35,step_pred_36,step_pred_4,step_pred_5,step_pred_6,step_pred_7,step_pred_8,step_pred_9
month_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
488,3.218876,3.218876,11062113.0,11062113.0,12.75218,12.6504,2.13735,6.15594,0.10543,0.08998,...,2.532329,2.659568,2.357273,2.248837,1.768521,1.43068,1.381671,1.878586,1.588895,2.026045
489,0.693147,0.693147,11062113.0,11062113.0,15.16145,14.22915,3.20609,4.83517,0.09004,0.09765,...,2.740053,2.668735,2.059299,2.554636,2.485806,1.304931,1.503598,1.312958,1.721088,1.863862
490,1.609438,1.609438,11062113.0,11062113.0,21.0779,13.70679,4.36394,3.63354,0.18347,0.11137,...,2.474459,2.767469,2.082304,2.309595,1.694217,2.498828,1.480119,1.355356,1.364338,1.879686
491,2.484907,2.484907,11062113.0,11062113.0,17.211529,13.411079,3.14073,4.43855,0.15888,0.06939,...,2.897392,2.581539,2.242971,2.358017,2.40345,1.620644,2.632906,1.425801,1.264489,1.669755
492,0.0,0.0,11062113.0,11062113.0,14.946149,17.431931,2.81162,2.98259,0.06183,0.26696,...,2.849369,2.834021,1.997118,2.378682,2.567327,2.10424,1.634651,2.268459,1.117458,1.397199


## Notes on training time for the various algorithms:

In [None]:
#These are calculated in minutes for the hh20 feature set (with about 40 features), for all 36 steps, calibration (c) and test (t) partitions, also include generating predictions, and are approximate:

#nj=12 (number of threads)
#scikit random forest:        21:13 (c), 26:20 (t) RandomForestRegressor(n_estimators=200, n_jobs=nj)
#XGB random forest:           06:02 (c), 07:51 (t) XGBRFRegressor(n_estimators=300,n_jobs=nj)
#scikit gbm:                  13:59 (c), 15:55 (t) GradientBoostingRegressor(), 
#scikit hurdle random forest: 07:32 (c), 09:49 (t) For both clf and reg: (n_estimators=200, n_jobs=nj)
#XGB hurdle xgb:              01:26 (c), 01:32 (t) For both clf and reg:                n_estimators=200,tree_method='hist',n_jobs=nj)
#scikit histgbm:              01:17 (c), 01:20 (t) HistGradientBoostingRegressor(max_iter=200)
#XGB xgb:                     01:00 (c), 01:04 (t) XGBRegressor(n_estimators=200,tree_method='hist',n_jobs=nj)
#lightgbm gbm:                00:25 (c), --    (t) LGBMRegressor(n_estimators=100,num_threads=8)

# Various helper functions and tools....

In [15]:
!conda list | grep views-forecasts

views-forecasts           0.5.3                    pypi_0    pypi


# Retrieving external forecasts

In [16]:
# Retrieve David's Markov models
# To do: rewrite the model dictionary to the new, slimmer version.
DRList = []

# The two Markov model specifications differ only in their name, so a single
# loop builds both dictionaries (each with its own empty 'algorithm' list).
for markov_modelname in ('fat_hh20_Markov_glm', 'fat_hh20_Markov_rf'):
    model = {
        'modelname': markov_modelname,
        'algorithm': [],
        'depvar': "ln_ged_sb_dep",
        'data_train': 'hh20',
        'queryset': 'hh_20_features',
    }
    DRList.append(model)



In [21]:
# NOTE(review): the saved output of this cell already shows the predictions_file_*
# keys that are only assigned in the following cell (the execution counts are out
# of order); run the notebook top to bottom to reproduce it.
DRList

[{'modelname': 'fat_hh20_Markov_glm',
  'algorithm': [],
  'depvar': 'ln_ged_sb_dep',
  'data_train': 'hh20',
  'queryset': 'hh_20_features',
  'predictions_file_calib': '/Users/malika/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/vmm_glm_hh20_0125_alt_calib.csv',
  'predictions_file_test': '/Users/malika/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/vmm_glm_hh20_0125_alt_test.csv',
  'predictions_file_future': '/Users/malika/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/vmm_glm_hh20_506.csv'},
 {'modelname': 'fat_hh20_Markov_rf',
  'algorithm': [],
  'depvar': 'ln_ged_sb_dep',
  'data_train': 'hh20',
  'queryset': 'hh_20_features',
  'predictions_file_calib': '/Users/malika/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/vmm_rf_hh20_0125_alt_calib.csv',
  'predictions_file_test': '/Users/malika/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/vmm_rf_hh20_01

In [17]:
# Folder holding the externally produced Markov prediction files
path = f'/Users/{os.getlogin()}/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/'

# File names per DRList entry, in DRList order (glm first, then rf).
# NOTE(review): the glm future file ends in 506 and the rf one in 505 — confirm both are intended.
markov_prediction_files = [
    {
        'predictions_file_calib': 'vmm_glm_hh20_0125_alt_calib.csv',
        'predictions_file_test': 'vmm_glm_hh20_0125_alt_test.csv',
        'predictions_file_future': 'vmm_glm_hh20_506.csv',
    },
    {
        'predictions_file_calib': 'vmm_rf_hh20_0125_alt_calib.csv',
        'predictions_file_test': 'vmm_rf_hh20_0125_alt_test.csv',
        'predictions_file_future': 'vmm_rf_hh20_505.csv',
    },
]

# Attach the full path of each file to the matching model dictionary
for markov_index, markov_files in enumerate(markov_prediction_files):
    for file_key, file_name in markov_files.items():
        DRList[markov_index][file_key] = path + file_name

In [18]:
print(path)

/Users/malika/Dropbox (ViEWS)/ViEWS/Projects/PredictingFatalities/Predictions/cm/preds/


In [19]:

for model in ModelList:
    print(model['modelname'])

fat_dev_mc_media_baseline_xgbrf
fat_dev_mc_media_topics_xgbrf
fat_dev_mc_media_google_internet_hurdle
fat_dev_mc_media_all_features_xgbrf


In [22]:
# Storing Markov models in central storage
# Retrieving dependent variable
# The observed outcome is taken from the stored predictions of the baseline
# xgbrf model for this run.
# NOTE(review): the store names are hard-coded; they must match what the training loop stored.
target_calib = pd.DataFrame.forecasts.read_store('cm_fat_dev_mc_media_baseline_xgbrf_calib', run=run_id)['ln_ged_sb_dep']
target_test = pd.DataFrame.forecasts.read_store('cm_fat_dev_mc_media_baseline_xgbrf_test', run=run_id)['ln_ged_sb_dep']
level = 'cm'
for model in DRList:
    # Read the externally produced predictions, indexed by (month_id, country_id)
    df_calib = pd.read_csv(model['predictions_file_calib'],index_col=['month_id','country_id'])
    df_test = pd.read_csv(model['predictions_file_test'],index_col=['month_id','country_id'])
    df_future = pd.read_csv(model['predictions_file_future'],index_col=['month_id','country_id'])
    # Attach the outcome column (presumably aligned on the shared index — confirm
    # that target_calib/target_test carry the same (month_id, country_id) index)
    df_calib['ln_ged_sb_dep'] = target_calib
    df_test['ln_ged_sb_dep'] = target_test
    # NOTE(review): df_future is read and prepared here but never written to the
    # store in this cell — confirm whether storing it was dropped on purpose.
    df_future['ln_ged_sb_dep'] = np.nan # Empty dependent variable column for consistency/required by prediction storage function
    # Store the calibration predictions under this run, replacing any existing entry
    stored_modelname = level + '_' + model['modelname'] + '_calib'
    df_calib.forecasts.set_run(run_id)
    df_calib.forecasts.to_store(name=stored_modelname, overwrite=True)
    # Store the test predictions under this run, replacing any existing entry
    stored_modelname = level + '_' + model['modelname'] + '_test'
    df_test.forecasts.set_run(run_id)
    df_test.forecasts.to_store(name=stored_modelname, overwrite=True)    

pr_51_cm_fat_dev_mc_media_baseline_xgbrf_calib.parquet
pr_51_cm_fat_dev_mc_media_baseline_xgbrf_test.parquet


In [23]:
ModelList

[{'modelname': 'fat_dev_mc_media_baseline_xgbrf',
  'algorithm': XGBRFRegressor(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bytree=None,
                 early_stopping_rounds=None, enable_categorical=False,
                 eval_metric=None, gamma=None, gpu_id=None, grow_policy=None,
                 importance_type=None, interaction_constraints=None, max_bin=None,
                 max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
                 max_leaves=None, min_child_weight=None, missing=nan,
                 monotone_constraints=None, n_estimators=300, n_jobs=4,
                 num_parallel_tree=None, objective='reg:squarederror',
                 predictor=None, random_state=None, reg_alpha=None,
                 sampling_method=None, scale_pos_weight=None, ...),
  'depvar': 'ln_ged_sb_dep',
  'data_train': 'media_baseline',
  'queryset': 'fat_dev_mc_media_baseline',
  'preprocessing': 'float_it',
  'level': 'c