In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
import pymc as pm
import arviz as az
import dill
import os



In [2]:
from thesis_tools.utils.data import *
from thesis_tools.models.frequentist import *
from thesis_tools.models.bayesian_univariate_time_series import *

In [3]:
# Load the panel data set used throughout this notebook.
# NOTE(review): read_panel_data comes from one of the star imports above
# (presumably thesis_tools.utils.data); observations_threshold=0 presumably
# disables any filtering on the number of observations - TODO confirm.
df = read_panel_data(observations_threshold=0)

In [4]:
# Show the distinct values of the 'group' column in the panel.
df['group'].unique()

array(['Alps', 'Asian Islands', 'Australia', 'Brazil', 'British Islands',
       'Canada', 'China', 'France', 'Germany', 'India', 'Israel + Turkey',
       'Italy', 'Japan', 'Russia', 'Scandinavia', 'South Korea',
       'Southeast Asia', 'U.S.'], dtype=object)

In [5]:
# Quick look at a single group: select the 'U.S.' rows and, in the same step,
# keep only the key columns, N_net_worth and the log-change covariates.
temp = df.loc[
    df['group'] == 'U.S.',
    [
        'group', 'year', 'N_net_worth',
        'log_change_gdp_pc', 'log_change_CAC40', 'log_change_DAX',
        'log_change_FTSE100', 'log_change_MOEX', 'log_change_MSCI',
        'log_change_NIFTY', 'log_change_OMX40', 'log_change_SPX',
        'log_change_SSE',
    ],
]
temp

Unnamed: 0,group,year,N_net_worth,log_change_gdp_pc,log_change_CAC40,log_change_DAX,log_change_FTSE100,log_change_MOEX,log_change_MSCI,log_change_NIFTY,log_change_OMX40,log_change_SPX,log_change_SSE
401,U.S.,2001,182,,,,,,,,,,
402,U.S.,2002,176,0.023004,-0.29594,-0.285476,-0.198295,,-0.25443,,,-0.1895,-0.325531
403,U.S.,2003,165,0.038527,-0.41788,-0.61992,-0.370023,,-0.231568,,,-0.27823,0.005449
404,U.S.,2004,211,0.055037,0.213867,0.390027,0.207644,,0.315883,,,0.279053,0.058851
405,U.S.,2005,261,0.055899,0.072926,0.047221,0.099971,,0.092312,,,0.043373,-0.288708
406,U.S.,2006,292,0.048195,0.234501,0.287861,0.171528,,0.115666,,,0.080332,0.054076
407,U.S.,2007,329,0.037062,0.125268,0.179399,0.074065,,0.155394,,,0.116498,0.795167
408,U.S.,2008,382,0.01076,-0.141199,0.009184,-0.053528,,-0.019322,,,-0.042388,0.453096
409,U.S.,2009,334,-0.02872,-0.49317,-0.45701,-0.348498,,-0.57214,-0.580574,,-0.512338,-0.789358
410,U.S.,2010,403,0.030378,0.22906,0.256841,0.223427,,0.288665,0.529582,,0.262575,0.406572


In [6]:
# Go through all stored models in ../../Stored_Models/bayesian_univariate_time_series/
# and count the divergent transitions recorded in each model's trace.
# The result, `divergence_df`, is indexed by file name ('model') and sorted so
# that the models with the most divergences come first.
divergence_dict = {}
path = "../../Stored_Models/bayesian_univariate_time_series/"
# sorted() makes the scan order (and therefore the order of ties) reproducible.
for file in sorted(os.listdir(path)):
    # Only model files are written to this folder as .pkl; skip anything else
    # (e.g. hidden OS files) instead of crashing while unpickling it.
    if not file.endswith(".pkl"):
        continue
    # NOTE: dill/pickle can execute arbitrary code on load - only use on trusted files.
    # The context manager makes sure the file handle is closed again.
    with open(path + file, "rb") as f:
        temp_model = dill.load(f)
    temp_trace = temp_model.get_trace()
    # 'diverging' is a boolean array; summing it counts the divergences.
    n_divergences = int(temp_trace['sample_stats']['diverging'].values.sum())
    divergence_dict[file] = n_divergences
divergence_df = (
    pd.DataFrame(list(divergence_dict.items()), columns=['model', 'n_divergences'])
    .sort_values(by='n_divergences', ascending=False)
    .set_index('model')
)

In [7]:
# Display the number of divergences per stored model file, highest first.
divergence_df

Unnamed: 0_level_0,n_divergences
model,Unnamed: 1_level_1
British Islands_Pareto_2007_2020_constant_log_change_gdp_pc_log_change_FTSE100.pkl,1589
Asian Islands_Pareto_2010_2021_constant_log_change_gdp_pc_log_change_MSCI.pkl,1403
Japan_Pareto_2014_2021_constant_log_change_gdp_pc_log_change_MSCI.pkl,1258
Canada_Pareto_2010_2021_constant_log_change_gdp_pc.pkl,1165
Canada_Pareto_2010_2019_constant_log_change_gdp_pc_log_change_MSCI.pkl,1107
...,...
India_GeneralisedPareto_2009_2021_constant_log_change_gdp_pc.pkl,31
U.S._GeneralisedPareto_2005_2019_constant_log_change_gdp_pc.pkl,29
U.S._GeneralisedPareto_2005_2020_constant_log_change_gdp_pc.pkl,24
U.S._Weibull_2005_2021_constant_log_change_gdp_pc.pkl,23


In [8]:
# Notes:
# Alps: 20 obs from 2013 onwards -> train until 2021, predict 2022, use const, gdp and MSCI world
# Asian Islands: 25 obs from 2010 onwards -> train until 2019 & 2020 & 2021, predict 2020 & 2021 & 2022, use const, gdp and MSCI world
# Australia: 20 obs from 2013 onwards -> train until 2021, predict 2022, use const, gdp and MSCI world
# Brazil: 30 obs from 2011 onwards -> train until 2020 & 2021, predict 2021 & 2022, use const, gdp and MSCI world
# British Islands: 25 obs from 2007 onwards -> train until 2019 & 2020 & 2021, predict 2020 & 2021 & 2022, use const, gdp and FTSE100
# Canada: 24 obs from 2010 onwards -> train until 2019 & 2020 & 2021, predict 2020 & 2021 & 2022, use const, gdp and MSCI world
# China: 30 obs from 2007 onwards -> train until 2019 & 2020 & 2021, predict 2020 & 2021 & 2022, use const, gdp and SSE
# France: 25 obs from 2013 onwards -> train until 2021, predict 2022, use const, gdp and CAC40
# Germany: 40 obs from 2005 onwards -> train until 2019 & 2020 & 2021, predict 2020 & 2021 & 2022, use const, gdp and DAX
# India: 40 obs from 2009 onwards -> train until 2019 & 2020 & 2021, predict 2020 & 2021 & 2022, use const, gdp and NIFTY
# Israel + Turkey: 30 obs from 2010 onwards -> train until 2019 & 2020 & 2021, predict 2020 & 2021 & 2022, use const, gdp and MSCI world
# Italy: 30 obs from 2014 onwards -> train until 2021, predict 2022, use const, gdp and MSCI world
# Japan: 25 obs from 2014 onwards -> train until 2021, predict 2022, use const, gdp and MSCI world
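# Russia: 45 obs from 2007 onwards -> train until 2019 & 2020 & 2021, predict 2020 & 2021 & 2022, use const, gdp and MOEX (only for 2015 onwards)
#   NOTE: the `setups` entry for Russia uses start_year=2015 and end_year=[2021] only (i.e. train until 2021, predict 2022) -> TODO confirm which plan is intended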
# Scandinavia: only from 2016 onwards due to OMX40 -> train until 2021, predict 2022, use const, gdp and OMX40
# South Korea: 30 obs from 2015 onwards -> train until 2021, predict 2022, use const, gdp and MSCI world
# Southeast Asia: 30 obs from 2013 onwards -> train until 2021, predict 2022, use const, gdp and MSCI world
# U.S.: train until 2019, 2020 and 2021, predict 2020, 2021 and 2022, use const, gdp and SPX

In [9]:
# One configuration per group. Each row of the table below is
# (group, first training year, list of final training years, market-index covariate);
# every configuration additionally uses a constant and GDP per capita.
setups = [
    {
        "group": group_name,
        "start_year": first_year,
        "end_year": last_years,
        "covariates": ["constant", "log_change_gdp_pc", index_covariate],
    }
    for group_name, first_year, last_years, index_covariate in (
        ("Alps", 2013, [2021], "log_change_MSCI"),
        ("Asian Islands", 2010, [2019, 2020, 2021], "log_change_MSCI"),
        ("Australia", 2013, [2021], "log_change_MSCI"),
        ("Brazil", 2011, [2020, 2021], "log_change_MSCI"),
        ("British Islands", 2007, [2019, 2020, 2021], "log_change_FTSE100"),
        ("Canada", 2010, [2019, 2020, 2021], "log_change_MSCI"),
        ("China", 2007, [2019, 2020, 2021], "log_change_SSE"),
        ("France", 2013, [2021], "log_change_CAC40"),
        ("Germany", 2005, [2019, 2020, 2021], "log_change_DAX"),
        ("India", 2009, [2019, 2020, 2021], "log_change_NIFTY"),
        ("Israel + Turkey", 2010, [2019, 2020, 2021], "log_change_MSCI"),
        ("Italy", 2014, [2021], "log_change_MSCI"),
        ("Japan", 2014, [2021], "log_change_MSCI"),
        ("Russia", 2015, [2021], "log_change_MOEX"),
        ("Scandinavia", 2016, [2021], "log_change_OMX40"),
        ("South Korea", 2015, [2021], "log_change_MSCI"),
        ("Southeast Asia", 2013, [2021], "log_change_MSCI"),
        ("U.S.", 2005, [2019, 2020, 2021], "log_change_SPX"),
    )
]

In [10]:
def train_or_retrieve_model(
    panel_df:pd.DataFrame,
    group:str,
    covariates:list[str],
    start_year:int,
    end_year:int,
    model_type:str,
    retrain_if_saved=False
):
    """Load a stored time-series model from disk, or fit and store a new one.

    The model file lives in ``../../Stored_Models/bayesian_univariate_time_series/``
    and is named ``{group}_{model_type}_{start_year}_{end_year}`` followed by
    ``_{covariate}`` for every covariate, with a ``.pkl`` extension.

    Parameters
    ----------
    panel_df : pd.DataFrame
        Panel data; must contain the columns 'group' and 'year'.
    group : str
        Value of the 'group' column to model.
    covariates : list[str]
        Column names passed to the model as ``X_columns``.
    start_year : int
        Rows with ``year < start_year`` are dropped before fitting.
    end_year : int
        Passed to the model as ``train_until``.
    model_type : str
        One of 'Pareto', 'Weibull' or 'GeneralisedPareto'.
    retrain_if_saved : bool, default False
        If True, refit and overwrite the stored file even when it exists.

    Returns
    -------
    The loaded or newly fitted model object.

    Raises
    ------
    ValueError
        If a model has to be trained and ``model_type`` is not supported.
    """
    model_dir = "../../Stored_Models/bayesian_univariate_time_series"
    model_name = f"{group}_{model_type}_{start_year}_{end_year}"
    model_name += "".join(f"_{covariate}" for covariate in covariates)
    model_path = f"{model_dir}/{model_name}.pkl"

    if os.path.exists(model_path) and not retrain_if_saved:
        # NOTE: dill/pickle can execute arbitrary code on load - only use on trusted files.
        with open(model_path, "rb") as f:
            return dill.load(f)

    # Fail fast with a clear message; previously an unknown type fell through
    # the if/elif chain and crashed later with an UnboundLocalError on `model`.
    supported_model_types = ('Pareto', 'Weibull', 'GeneralisedPareto')
    if model_type not in supported_model_types:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_model_types}"
        )

    # Work on a deep copy so the caller's frame is never touched by the model.
    data = copy.deepcopy(panel_df[panel_df['group'] == group])
    data = data[data['year'] >= start_year]

    if model_type == 'Pareto':
        model_class = Univariate_Pareto_TimeSeries
    elif model_type == 'Weibull':
        model_class = Univariate_Weibull_TimeSeries
    else:
        model_class = Univariate_GeneralisedPareto_TimeSeries
    model = model_class(
        panel_df=data,
        train_until=end_year,
        X_columns=covariates
    )

    # Make sure the target folder exists *before* the expensive sampling run,
    # so a missing directory cannot cost us a fitted model.
    os.makedirs(model_dir, exist_ok=True)
    model.fit(target_accept=0.99, nuts_sampler='pymc')
    with open(model_path, "wb") as f:
        dill.dump(model, f)
    return model

In [11]:
# Fit (or load from disk) every model configuration: for each setup, each
# model type and each final training year, once with the first two covariates
# only and once with the full covariate list.
# A stored model is retrained when its recorded number of divergences exceeds
# the threshold below; Pareto models are never force-retrained.
divergence_retrain_threshold = 250
i = 0
for setup in setups:
    model_types = ['Pareto', 'Weibull', 'GeneralisedPareto']
    for model_type in model_types:
        for end_year in setup['end_year']:
            covariate_options = [setup['covariates'][0:2], setup['covariates']]
            for option in covariate_options:
                i += 1
                # Must mirror the file name built inside train_or_retrieve_model.
                model_name = f"{setup['group']}_{model_type}_{setup['start_year']}_{end_year}"
                model_name += "".join(f"_{covariate}" for covariate in option)
                model_name += ".pkl"
                print(f"Training model {i}: {model_name}")
                # divergence_df only has rows for models that already exist on
                # disk. A configuration without a stored file has nothing to
                # look up, so treat it as 0 divergences instead of raising a
                # KeyError; train_or_retrieve_model trains it anyway because
                # the file is missing.
                if model_name in divergence_df.index:
                    n_divergences = divergence_df.loc[model_name, 'n_divergences']
                else:
                    n_divergences = 0
                retrain_if_saved = bool(
                    n_divergences > divergence_retrain_threshold
                    and model_type != 'Pareto'
                )
                train_or_retrieve_model(
                    panel_df=df,
                    group=setup['group'],
                    covariates=option,
                    start_year=setup['start_year'],
                    end_year=end_year,
                    model_type=model_type,
                    retrain_if_saved=retrain_if_saved
                )

Training model 1: Alps_Pareto_2013_2021_constant_log_change_gdp_pc.pkl
Training model 2: Alps_Pareto_2013_2021_constant_log_change_gdp_pc_log_change_MSCI.pkl
Training model 3: Alps_Weibull_2013_2021_constant_log_change_gdp_pc.pkl
Training model 4: Alps_Weibull_2013_2021_constant_log_change_gdp_pc_log_change_MSCI.pkl
Training model 5: Alps_GeneralisedPareto_2013_2021_constant_log_change_gdp_pc.pkl
Training model 6: Alps_GeneralisedPareto_2013_2021_constant_log_change_gdp_pc_log_change_MSCI.pkl
Training model 7: Asian Islands_Pareto_2010_2019_constant_log_change_gdp_pc.pkl
Training model 8: Asian Islands_Pareto_2010_2019_constant_log_change_gdp_pc_log_change_MSCI.pkl
Training model 9: Asian Islands_Pareto_2010_2020_constant_log_change_gdp_pc.pkl
Training model 10: Asian Islands_Pareto_2010_2020_constant_log_change_gdp_pc_log_change_MSCI.pkl
Training model 11: Asian Islands_Pareto_2010_2021_constant_log_change_gdp_pc.pkl
Training model 12: Asian Islands_Pareto_2010_2021_constant_log_chang