# Benchmark model generation

In [None]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import os


# Views 3
import views_runs
from viewser.operations import fetch
from views_forecasts.extensions import *
from viewser import Queryset, Column


In [None]:
# Common parameters:
import getpass

# Run identifiers and settings; dev_id, EndOfHistory, level and get_future are
# passed to RetrieveStoredPredictions further down.
dev_id = 'Fatalities002'
run_id = 'Fatalities002'
EndOfHistory = 508
level = 'cm'
get_future = False

# getpass.getuser() reads the login name from the environment (falling back to
# the password database), so it also works in notebook kernels started without
# a controlling terminal, where os.getlogin() raises OSError.
username = getpass.getuser()

steps = list(range(1, 36 + 1))  # Which steps to train and predict for

fi_steps = [1, 3, 6, 12, 36]

# Specifying partitions: (first, last) pairs for the train and predict periods
calib_partitioner_dict = {"train": (121, 396), "predict": (397, 444)}
test_partitioner_dict = {"train": (121, 444), "predict": (445, 492)}
future_partitioner_dict = {"train": (121, 492), "predict": (493, 504)}
calib_partitioner = views_runs.DataPartitioner({"calib": calib_partitioner_dict})
test_partitioner = views_runs.DataPartitioner({"test": test_partitioner_dict})
future_partitioner = views_runs.DataPartitioner({"future": future_partitioner_dict})

# Output locations inside the user's Dropbox folder
# NOTE(review): the '/Users/<name>/' prefix assumes a macOS home directory layout.
Mydropbox = f'/Users/{username}/Dropbox (ViEWS)/ViEWS/'
overleafpath = f'/Users/{username}/Dropbox (ViEWS)/Apps/Overleaf/ViEWS predicting fatalities/Tables/'

print('Dropbox path set to',Mydropbox)
print('Overleaf path set to',overleafpath)

## cm level

### Based on constituent models

Short version, 22 models:
1 "draw" from each of the 22 constituent models.

Long version, 440 models:
20 "draws" from each of the 22 constituent models, using predictions for adjacent steps (from s-4 to s+6). Some adjacent steps are duplicated so that the steps closest to s carry more weight.



In [None]:

# Fatalities002 stuff - contains the list of the current fatalities002 ensemble models

from ModelDefinitions import DefineEnsembleModels

ModelList = DefineEnsembleModels(level)

# List every model with its position in ModelList and its 'modelname' and
# 'data_train' entries; enumerate replaces the hand-maintained counter.
for i, model in enumerate(ModelList):
    print(i, model['modelname'], model['data_train'])

In [None]:
# Retrieving the predictions for calibration and test partitions
# The ModelList contains the predictions organized by model
# NOTE(review): mean_sd_calibrated and gam_calibrated are not referenced in the
# cells visible here -- confirm whether the import is still needed.
from Ensembling import CalibratePredictions, RetrieveStoredPredictions, mean_sd_calibrated, gam_calibrated

# Both helpers receive the model list and their return value replaces it.
# Presumably each model dict gains its stored predictions (the cells below read
# 'predictions_test_df') -- verify against the Ensembling module.
ModelList = RetrieveStoredPredictions(ModelList, steps, EndOfHistory, dev_id, level, get_future)

# Presumably calibrates the retrieved predictions -- verify against the Ensembling module.
ModelList = CalibratePredictions(ModelList, EndOfHistory, steps)

In [None]:
# Inspect the first model: its test-partition predictions, then the whole entry.
# NOTE(review): a notebook cell only displays the value of its last expression,
# so the first line shows nothing unless it is wrapped in print()/display().
ModelList[0]['predictions_test_df']
ModelList[0]

In [None]:
# Actuals: the 'ln_ged_sb_dep' column of the first model's test predictions,
# kept as a one-column dataframe
first_model_test_predictions = ModelList[0]['predictions_test_df']
df_actuals = pd.DataFrame(first_model_test_predictions['ln_ged_sb_dep'])

# Preview the first and last rows
for preview in (df_actuals.head(), df_actuals.tail()):
    print(preview)


In [None]:
def reshape_df(df, draw):
    """Return one model's step predictions in long format.

    Removes the 'ln_ged_sb_dep' column and the 'step_pred_21' to
    'step_pred_36' columns (another round of drops is done by the calling
    code), tags every row with ``draw`` and stacks the remaining
    'step_pred_<n>' columns into a single 'step_pred_' column indexed by
    (month_id, country_id, draw, step). The input frame is left untouched.
    """
    unused_columns = ['ln_ged_sb_dep']
    unused_columns += [f'step_pred_{step}' for step in range(21, 36 + 1)]

    # drop() and reset_index() both return new frames, so `df` is not mutated
    wide = df.drop(columns=unused_columns).reset_index()
    wide['draw'] = draw

    return pd.wide_to_long(
        wide,
        stubnames='step_pred_',
        i=['month_id', 'country_id', 'draw'],
        j='step',
    )
    
# Reshape the test predictions of every model to long format and stack them,
# using each model's position in ModelList as its draw number.
long_frames = []
for model_draw, model in enumerate(ModelList):
    # The name is read from the model being processed. Previously the first
    # message used a `model` variable left over from an earlier loop and so
    # reported the name of the last model in ModelList for draw 0.
    action = 'Starting with' if model_draw == 0 else 'Appending'
    print(action + ' model/draw', model_draw, model['modelname'])
    df = model['predictions_test_df'].copy()
    long_frames.append(reshape_df(df, model_draw))
    if model_draw == 0:
        # Summary of the first reshaped frame as a quick sanity check
        print(long_frames[0].describe())

# One concat at the end avoids re-copying the growing frame once per model
df_cm_results_long = pd.concat(long_frames, axis=0)
    

In [None]:
# Back-transform the stacked predictions with expm1 (the inverse of log1p),
# round to whole numbers and store them as int32 in 'prediction'.
# np.round is used because the np.round_ alias was deprecated in NumPy 1.25
# and removed in NumPy 2.0.
df_cm_results_long['prediction'] = np.round(np.expm1(df_cm_results_long['step_pred_'])).astype('int32')
df_cm_results_long.drop(columns=['step_pred_'], inplace=True)
# Results file in long format
print(df_cm_results_long.describe())
print(df_cm_results_long.tail())

# Summary restricted to month_id 492 (first index level)
print(df_cm_results_long.loc[492].describe())

In [None]:
# Quick arithmetic check; evaluates to 191.0.
# NOTE(review): 84040 is presumably a row count taken from a describe() output
# above, divided by 22 models x 20 steps -- confirm.
84040/(22*20)

In [None]:
# Working copy of the long results, used as the source when copying adjacent steps

df_cm_results_extended = df_cm_results_long.copy()

# Show summary statistics followed by the first rows
for preview in (df_cm_results_extended.describe(), df_cm_results_extended.head()):
    print(preview)

In [None]:
# Start the extended results from a full copy of the per-model predictions;
# the loop further down appends the shifted-step copies to this frame.
df_cm_final_extended = df_cm_results_extended.copy()

def make_dfcopy(df_in, step, shifted_step, repetition, n_models=None):
    """Return the rows of ``df_in`` for ``shifted_step``, relabelled as ``step``.

    Parameters
    ----------
    df_in : DataFrame indexed by (month_id, country_id, draw, step).
    step : step value the returned rows are labelled with.
    shifted_step : step whose rows are selected from ``df_in``.
    repetition : copy number; the draw of every returned row is raised by
        ``n_models * repetition``.
    n_models : size of the draw offset per repetition. Defaults to
        ``len(ModelList)``, which is what the function always used before.

    Returns
    -------
    A new DataFrame indexed by (month_id, country_id, draw, step); ``df_in``
    is not modified.
    """
    if n_models is None:
        n_models = len(ModelList)

    from_shifted_step = df_in.index.get_level_values('step') == shifted_step
    df = df_in[from_shifted_step].reset_index()

    # Every selected row has step == shifted_step, so a plain assignment
    # relabels all of them. The former `df['step'].replace(..., inplace=True)`
    # is a chained in-place update, which leaves `df` unchanged under pandas
    # Copy-on-Write.
    df['step'] = step
    df['draw'] = df['draw'] + n_models * repetition

    return df.set_index(['month_id', 'country_id', 'draw', 'step'])
    

# For every step 3-14, add copies of the predictions made for neighbouring
# steps. Each (offset, copies) pair gives how many times the predictions for
# step + offset are reused: 19 copies per step in total, which together with
# the rows already in df_cm_final_extended makes 20 per model.
# NOTE(review): steps outside 3-14 stay in df_cm_final_extended with only
# their original draws -- confirm whether they should be pruned before export.
shift_copies = [(-4, 1), (-3, 1), (-2, 2), (-1, 3), (0, 2), (1, 3),
                (2, 2), (3, 2), (4, 1), (5, 1), (6, 1)]
shifted_frames = []
for step in range(3, 14 + 1):
    print('Step', step, '-- adding copies from adjacent steps')
    repetition = 1
    for offset, n_copies in shift_copies:
        shifted_step = step + offset
        if shifted_step < 1:
            # Offsets that land below step 1 fall back to the step itself
            shifted_step = step
        for _ in range(n_copies):
            shifted_frames.append(
                make_dfcopy(df_in=df_cm_results_extended, step=step,
                            shifted_step=shifted_step, repetition=repetition)
            )
            # Each copy gets its own repetition number and so its own draws
            repetition += 1

# One concat at the end avoids re-copying the growing frame for every copy
df_cm_final_extended = pd.concat([df_cm_final_extended, *shifted_frames])

In [None]:
# Print the signature and docstring of make_dfcopy
help(make_dfcopy)

In [None]:
# Summary statistics of the per-model predictions for month_id 492
df_cm_results_long.loc[492].describe()

In [None]:
# Summary statistics of the extended predictions for month_id 492, for
# comparison with the per-model summary
df_cm_final_extended.loc[492].describe()

In [None]:
# Last rows of the extended predictions for month_id 492
df_cm_final_extended.loc[492].tail()

In [None]:
# Keep only steps 3-14 of the original (per-model) long results
steps_to_keep = range(3, 15)

step_of_row = df_cm_results_long.index.get_level_values('step')
df_cm_results_long_pruned = df_cm_results_long[step_of_row.isin(steps_to_keep)]
df_cm_results_long_pruned.tail()

### cm last historical values benchmark model

In [None]:
# Queryset "benchmark_cm" at the country_month level: the target column plus,
# added in the loop below, one lagged copy of it per lag.
qs = (Queryset("benchmark_cm", "country_month")

   # target variable
   .with_column(Column("ged_sb", from_table="ged2_cm", from_column="ged_sb_best_sum_nokgi")
                .transform.missing.fill()
                .transform.missing.replace_na()
                )


   .with_theme("benchmark")
   .describe("""Data for empirical benchmark model, cm level


            """)
   )

#queryset = Queryset("name", "loa") # if not already defined
column = "ged_sb_best_sum_nokgi"
table = "ged2_cm"
# Lags 1-48: each adds a column '<column>_<lag>' built from the same source
# column with replace_na followed by tlag(lag).
lags=range(1,49)
for lag in lags: 
    qs = qs.with_column(Column(column+'_' + str(lag), from_table = table, from_column=column)
                        .transform.missing.replace_na()
                        .transform.temporal.tlag(lag)
                       )
# Publish the queryset definition and fetch its data
df_cm_historical_values = qs.publish().fetch()
# Keep index labels 445-492, the 'predict' range of the test partition
# (presumably the first index level is month_id -- confirm)
df_cm_historical_values = df_cm_historical_values.loc[445:492]

df_cm_historical_values.describe()

In [None]:
# Creating predictions for test partition
# For every step 3-14, each lagged column of historical values is used as a
# set of "draws": lag L is repeated (number_of_lags - L) times, so the most
# recent lags contribute the most draws (990 draws per step in total).
# NOTE(review): the same lags are used for every step, so e.g. lag 1 is also
# used for step 14 -- confirm this is the intended benchmark definition.
number_of_lags = 45
lags = range(1, number_of_lags)
maxstep = 14
df_list_bystep = []
for step in range(3, maxstep + 1):
    # Draw numbering restarts at 1 for every step. It must be set before it is
    # reported: the progress print used to run first and raised a NameError on
    # `draw` in a fresh kernel (and showed a stale `lag` from an earlier cell).
    draw = 1
    df_list = []
    for lag in lags:
        lagged_col = 'ged_sb_best_sum_nokgi_' + str(lag)
        # The frame for one lag is the same for all repetitions apart from the
        # draw number, so it is built once per lag.
        lag_values = df_cm_historical_values[[lagged_col]].reset_index()
        lag_values = lag_values.rename(columns={lagged_col: 'prediction'})
        lag_values['step'] = step
        number_of_repetitions = number_of_lags + 1 - lag
        for repetition in range(1, number_of_repetitions):
            df = lag_values.assign(draw=draw)
            df = df.set_index(['month_id', 'country_id', 'draw', 'step'])
            df_list.append(df)
            draw = draw + 1
    print('step:', step, 'lags:', len(lags), 'draws:', draw - 1)
    df_cm_predictions_lag = pd.concat(df_list)
    df_list_bystep.append(df_cm_predictions_lag)

df_cm_predictions_historical_values = pd.concat(df_list_bystep)

df_cm_predictions_historical_values.describe()

In [None]:
# Display the historical-values benchmark predictions
df_cm_predictions_historical_values

In [None]:
# Export: actuals as csv, the three benchmark prediction sets as parquet,
# all into the 'Prediction_competition_2023/' folder under Mydropbox
filename = Mydropbox + 'Prediction_competition_2023/' + 'cm_actuals.csv'
df_actuals.to_csv(filename)

parquet_exports = (
    (df_cm_results_long_pruned, 'cm_benchmark_ensemble_22.parquet'),
    (df_cm_final_extended, 'cm_benchmark_ensemble_440.parquet'),
    (df_cm_predictions_historical_values, 'cm_benchmark_historical_values.parquet'),
)
for frame, basename in parquet_exports:
    filename = Mydropbox + 'Prediction_competition_2023/' + basename
    frame.to_parquet(filename)

# pgm benchmark

In [None]:
# Queryset "benchmark_pgm" at the priogrid_month level: three fatality columns
# (sb, os, ns), one spatially lagged column and one time-lagged column, all
# read from table ged2_pgm.
qs = (Queryset("benchmark_pgm", "priogrid_month")

   # target variable
   .with_column(Column("ged_sb", from_table="ged2_pgm", from_column="ged_sb_best_sum_nokgi")
                .transform.missing.fill()
                .transform.missing.replace_na()
                )

    # 'os' counterpart of the target, same missing-value handling
    .with_column(Column("ged_os", from_table="ged2_pgm", from_column="ged_os_best_sum_nokgi")
                 .transform.missing.fill()
                 .transform.missing.replace_na()
                 )

    # 'ns' counterpart of the target, same missing-value handling
    .with_column(Column("ged_ns", from_table="ged2_pgm", from_column="ged_ns_best_sum_nokgi")
                 .transform.missing.fill()
                 .transform.missing.replace_na()
                 )

   # Spatial lag
   # Transform chain, in order: replace_na -> bool.gte(1) -> time_since ->
   # decay(24) -> spatial.lag(1, 1, 0, 0) -> replace_na
   .with_column(Column("splag_1_1_sb_1", from_table="ged2_pgm", from_column="ged_sb_best_sum_nokgi")
                .transform.missing.replace_na()
                .transform.bool.gte(1)
                .transform.temporal.time_since()
                .transform.temporal.decay(24)
                .transform.spatial.lag(1, 1, 0, 0)
                .transform.missing.replace_na()
                )
   # time lag 1 of target variable
   # NOTE(review): this comment used to say "timelags 1-12", but only lag 1 is
   # defined here -- confirm whether lags 2-12 are missing.
   .with_column(Column("ged_sb_tlag_1", from_table="ged2_pgm",
                       from_column="ged_sb_best_sum_nokgi")
                .transform.missing.fill()
                .transform.temporal.tlag(1)
                .transform.missing.fill()
                )

   .with_theme("benchmark")
   .describe("""Data for empirical benchmark model, pgm level


            """)
   )

# Publish the queryset definition and fetch its data
data = qs.publish().fetch()

In [None]:
# Display the fetched pgm-level benchmark data
data