NOTE: All models and code used here were developed purely for the purpose of illustration. We use a different set of models in production, although the principles are all the same. 

In [181]:
import os
import dill

import numpy as np
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
import arviz as az

from plotly.subplots import make_subplots
from cmdstanpy import CmdStanModel

# Select the plotly renderer used by default whenever a figure is displayed in this notebook.
pio.renderers.default = "notebook_connected"

First, we will read in the data that we saved out from the `loss_development.ipynb` notebook:

In [182]:
# Every input lives two directory levels above the working directory.
_BASE_DIR = os.path.join("..", "..")
_DATA_DIR = os.path.join(_BASE_DIR, "data")

# Stan program for the forecasting model.
MODEL_PATH = os.path.join(_BASE_DIR, "stan", "ar1_hierarchical.stan")

# Pickled inputs that are loaded in the next cell.
LOSS_DATA_PATH = os.path.join(_DATA_DIR, "developed_losses.pkl")
LOSS_DEV_TEST_DATA_PATH = os.path.join(_DATA_DIR, "loss_dev_test_data.pkl")
PREMIUM_DATA_PATH = os.path.join(_DATA_DIR, "premium_data.pkl")

In [183]:
def _load_dill(path):
    """Return the object that dill deserialises from the binary file at ``path``."""
    with open(path, "rb") as infile:
        return dill.load(infile)


# The three pickled inputs are read in the same order as before:
# developed losses, premium data, then the loss-development test data.
loss_pred = _load_dill(LOSS_DATA_PATH)
premium_data = _load_dill(PREMIUM_DATA_PATH)
loss_dev_test_data = _load_dill(LOSS_DEV_TEST_DATA_PATH)

We will use the total premium volume for each program as a covariate in the forecasting model, but first we will divide it by 1000 to make the values more manageable:

In [184]:
# For every element of the leading axis, take the last entry along both
# trailing axes of the premium array and express it in thousands.
premium_array = np.asarray(premium_data)
ult_premium = premium_array[:, -1, -1] / 1e3
ult_premium

array([5.6978000e+01, 1.3165000e+02, 1.9217000e+01, 1.6786200e+02,
       4.1560000e+00, 6.9057000e+01, 6.8987000e+01, 6.9154000e+01,
       3.6853000e+01, 4.7712000e+01, 2.9638000e+01, 1.8462300e+02,
       4.7720000e+01, 2.0509000e+01, 1.5065713e+04, 2.2052330e+03,
       2.4010000e+01, 4.0323000e+01, 2.2203200e+02, 5.5448900e+02,
       5.3853000e+01, 1.6078000e+01, 3.8510000e+00, 3.5600000e-01,
       1.7018300e+02])

Then, the input for our forecasting model is the posterior prediction for each accident year at the last development lag (this simplifies real-world applications, but works for our example):

In [185]:
# get the developed "right edge": keep the first three axes of loss_pred in
# full and take only the last index of the fourth axis
# NOTE(review): the accompanying text describes that last index as the final
# development lag; confirm against the loss development notebook
developed_losses = loss_pred[:,:,:,-1]

We then take the mean and standard deviation of each of these quantities so that we can propagate the uncertainty from the development model to the forecasting model:

In [186]:
# Collapse the leading axis of developed_losses into a NaN-ignoring mean and
# standard deviation; both results keep the remaining axes.
# NOTE(review): the leading axis is presumably the posterior draws; confirm.
_REDUCE_AXIS = 0
developed_loss_means = np.nanmean(developed_losses, axis=_REDUCE_AXIS)
developed_loss_sds = np.nanstd(developed_losses, axis=_REDUCE_AXIS)

Formatting for Stan:  

In [187]:
# The first two dimensions of the mean array become the Stan sizes N and AY.
n_programs, n_accident_years = developed_loss_means.shape[:2]

stan_data = {
    # dimensions
    "N": n_programs,
    "AY": n_accident_years,
    # development-model summaries and the premium covariate
    "ult_loss": developed_loss_means,
    "ult_loss_sd": developed_loss_sds,
    "ult_premium": ult_premium,
    # run settings; both are set again just before sampling
    "prior_only": 0,
    "N_AY_pred": 2,
}

# An AR1 Model with Measurement Error

For this example, the model itself is a simple AR1-style model where time is indicated by the accident year (remember, we now have the posterior mean and standard deviation for each "ultimate" loss ratio that we generated per the development model). 

Here, we propagate the uncertainty from the development model to the forecasting model by assuming that $\text{LR}_{\text{obs},t}$ (the posterior predicted mean from the development model for accident year $t$) follows a gamma distribution with true mean $\text{LR}_{\text{true},t}$ (a free parameter) and standard deviation $\text{SD}(\text{LR}_{\text{obs},t})$ (the posterior predicted standard deviation from the development model for accident year $t$).

We also assume that the variance is a function of the size of the program, where premium volume is a proxy for program size. 

Finally, we assume that the true loss ratios follow a gamma distribution.

$$
\begin{align*}
    \text{LR}_{\text{true},t} &\sim \Gamma(\mu_t^2 / \sigma_t^2, \mu_t / \sigma_t^2)\\
    \mu_t &= \phi \cdot \text{LR}_{\text{true},t-1} + (1-\phi) \mu \\
    \sigma_t &= \exp(\sigma_\text{int} + \sigma_\text{slope} / \text{Premium}) \\
    \text{LR}_{\text{obs},t} &\sim \Gamma(\text{LR}_{\text{true},t}^2 / \text{SD}(\text{LR}_{\text{obs},t})^2, \text{LR}_{\text{true},t} / \text{SD}(\text{LR}_{\text{obs},t})^2)
\end{align*}
$$

Time for Stan!

In [None]:
# Build the model object from the Stan program at MODEL_PATH, named "ar1".
ar1 = CmdStanModel(model_name="ar1", stan_file=MODEL_PATH)

In [None]:
# NOTE: stan_data["N_AY_pred"] sets how many of the tail-end accident years
# are used as test data. Both settings are re-applied here so this cell can be
# re-run on its own after editing them.
stan_data.update({"prior_only": 0, "N_AY_pred": 2})

fit = ar1.sample(
    data=stan_data,
    chains=4,
    parallel_chains=4,
    iter_warmup=1000,
    iter_sampling=1000,
    adapt_delta=0.8,
    inits=0.2,
    seed=43215,
)

We get warnings like before, but again specific to the variable transforms we are doing for the gamma parameterization. Diagnostics look pretty good: 

In [None]:
# Display the summary table of the fit; this is where the diagnostics are read from.
fit.summary()

Some pairs plots: 

In [None]:
# Convert the fit to an ArviZ object once, then draw scatter pair plots for
# the selected parameters with divergent transitions marked.
inference_data = az.from_cmdstanpy(fit)
pair_vars = ["mu_target_lr", "sigma_target_lr", "reversion", "sigma_int", "sigma_slope"]
az.plot_pair(inference_data, var_names=pair_vars, kind="scatter", divergences=True)

And a function to plot the posterior predictions on observed versus true loss ratios: 

In [191]:
def plot_predictions(pred_ult_loss, loss_dev_test_data, program_idx):
    """Plot one program's forecast against the development-model inputs and the test data.

    The figure contains, in drawing order:

    * the mean of ``pred_ult_loss`` over its first axis as a line, with a
      filled band between the 5% and 95% quantiles;
    * the development-model means with error bars of ``developed_loss_sds``,
      where the last ``stan_data["N_AY_pred"]`` years are blanked out;
    * the values ``loss_dev_test_data[program_idx, :, -1]``.

    Besides its arguments, the function reads the notebook-level variables
    ``developed_loss_means``, ``developed_loss_sds`` and ``stan_data``. None of
    them is modified.

    Args:
        pred_ult_loss: Array indexed as ``[i, program, year]``. Axis 0 is
            summarised with NaN-ignoring statistics (presumably posterior
            draws; confirm against the Stan model).
        loss_dev_test_data: Array indexed as ``[program, year, k]``; only the
            last index of the final axis is plotted.
        program_idx: Index of the program to plot.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.
    """
    fig = go.Figure()

    program_pred = pred_ult_loss[:, program_idx, :]
    data_mu = np.nanmean(program_pred, axis=0)
    data_lower = np.nanquantile(program_pred, 0.05, axis=0)
    data_upper = np.nanquantile(program_pred, 0.95, axis=0)

    # Take the number of years from the predictions instead of assuming 10.
    n_years = data_mu.shape[0]
    x = np.arange(n_years)
    n_holdout = stan_data["N_AY_pred"]

    colors = px.colors.sequential.Viridis

    # Forecast mean.
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data_mu,
            mode="lines",
            marker=dict(color=colors[0]),
            showlegend=False,
        ),
    )
    # Invisible upper bound; the next trace fills down from it.
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data_upper,
            mode="lines",
            marker=dict(color=colors[0]),
            line=dict(width=0),
            showlegend=False,
        ),
    )
    # Invisible lower bound, filled up to the previous trace.
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data_lower,
            mode="lines",
            marker=dict(color=colors[0]),
            line=dict(width=0),
            fill="tonexty",
            showlegend=False,
        ),
    )

    # Work on a copy: slicing the notebook-level array returns a view, and
    # writing NaN into a view would silently corrupt the data passed to Stan.
    dev_means = np.array(developed_loss_means[program_idx, :], dtype=float)
    # Guard the zero case: ``-0:`` would select, and blank, every year.
    if n_holdout > 0:
        dev_means[-n_holdout:] = np.nan

    # Development-model means with their standard deviations as error bars.
    fig.add_trace(
        go.Scatter(
            x=x,
            y=dev_means,
            error_y=dict(
                type="data",
                array=developed_loss_sds[program_idx, :],
                visible=True,
            ),
            mode="markers",
            name=None,
            showlegend=False,
            marker=dict(color=colors[0], size=10),
        ),
    )

    # Overlay markers: fitted years in the main colour, held-out years white.
    first_holdout = n_years - n_holdout
    point_colors = [colors[0] if i < first_holdout else "white" for i in x]
    fig.add_trace(
        go.Scatter(
            x=x,
            y=dev_means,
            mode="markers",
            name=None,
            showlegend=False,
            marker=dict(color=point_colors),
        ),
    )

    # Test-set values for the same program.
    fig.add_trace(
        go.Scatter(
            x=x,
            y=loss_dev_test_data[program_idx, :, -1],
            mode="markers",
            name=None,
            showlegend=False,
            marker=dict(color=colors[8]),
        ),
    )

    return fig

The light green dots are the true loss ratios, purple dots and error bars the development model posterior predicted means+standard deviations, and the purple line+shaded interval the posterior predicted mean and uncertainty intervals for the forecasting model:

In [223]:
# Extract the "pred_ult_loss" variable from the fit and plot the program at index 1.
PROGRAM_TO_PLOT = 1
pred_ult_loss = fit.stan_variable("pred_ult_loss")
plot_predictions(pred_ult_loss, loss_dev_test_data, PROGRAM_TO_PLOT)