# Modelling motor insurance claim frequency
Using the French Motor Claims dataset as an example to work through predictive modelling considerations. It is designed to be run on a Kaggle Kernel here: <https://www.kaggle.com/btw78jt/models-of-french-motor-claims>

<!-- This table of contents is updated *manually* -->
# Contents
1. [Setup](#Setup)
1. [Modelling data](#Modelling-data): Load data, Pre-processing, Split for modelling
1. [Modelling setup](#Modelling-setup): Setup modelling, Modelling notes, Useful functions, Mean model
1. [Simple features model](#Simple-features-model): Fit and score, Visualise fit (Lift plot, Individual factors)
1. [Feature selection](#Feature-selection): Simple factors, Engineered features
1. [Proposed model](#Proposed-model): Fit and score, Visualise fit
1. [Output results](#Output-results): Fit on all train, Score on all validation, Save
1. [Rough work only](#Rough-work-only)

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>

# Setup

In [None]:
# Set warning messages
import warnings
# Show all warnings in IPython
warnings.filterwarnings('always')
# Ignore specific numpy warnings (as per <https://github.com/numpy/numpy/issues/11788#issuecomment-422846396>)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# Other warnings that sometimes come up
warnings.filterwarnings("ignore", message="unclosed file <_io.TextIOWrapper")
warnings.filterwarnings("ignore", message="Anscombe residuals currently unscaled")

In [None]:
# Import built-in modules
import sys
import platform
import os
from pathlib import Path
import string

# Import external modules
from IPython import __version__ as IPy_version
import IPython.display as ipyd
import numpy as np
import pandas as pd
from sklearn import __version__ as skl_version
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.pyplot as plt
from bokeh import __version__ as bk_version
from scipy import __version__ as scipy_version
from statsmodels import __version__ as sm_version
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import __version__ as patsy_version

# Check they have loaded and the versions are as expected
assert platform.python_version_tuple() == ('3', '6', '6')
print(f"Python version:\t\t{sys.version}")
assert IPy_version == '7.13.0'
print(f'IPython version:\t{IPy_version}')
assert np.__version__ == '1.18.2'
print(f'numpy version:\t\t{np.__version__}')
assert pd.__version__ == '0.25.3'
print(f'pandas version:\t\t{pd.__version__}')
assert skl_version == '0.22.2.post1'
print(f'sklearn version:\t{skl_version}')
assert mpl.__version__ == '3.2.1'
print(f'matplotlib version:\t{mpl.__version__}')
assert bk_version == '2.0.1'
print(f'bokeh version:\t\t{bk_version}')
assert scipy_version == '1.4.1'
print(f'scipy version:\t\t{scipy_version}')
assert sm_version == '0.11.0'
print(f'statsmodels version:\t{sm_version}')
assert patsy_version == '0.5.1'
print(f'patsy version:\t\t{patsy_version}')

In [None]:
# Bokeh imports
from bokeh.layouts import gridplot
from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.models.ranges import Range1d
from bokeh.models.axes import LinearAxis

# Load Bokeh for use in a notebook
from bokeh.io import output_notebook
output_notebook()

In [None]:
# Configuration variables
claims_data_filepath = Path('/kaggle/input/french-motor-claims-datasets-fremtpl2freq/freMTPL2freq.csv')

In [None]:
# Output exact environment specification, in case it is needed later
print("Capturing full package environment spec")
print("(But note that not all these packages are required)")
!pip freeze > requirements_Kaggle.txt
!jupyter --version > jupyter_versions.txt

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>

# Modelling data

## Load data

In [None]:
# Load full data set
expected_dtypes = {
    **{col: np.dtype('int64') for col in [
        'IDpol', 'ClaimNb', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']},
    **{col: np.dtype('float64') for col in ['Exposure']},
    **{col: np.dtype('O') for col in ['Area', 'VehBrand', 'VehGas', 'Region']},
}
df_raw = pd.read_csv(claims_data_filepath, delimiter=',', dtype=expected_dtypes)

In [None]:
# Check it has loaded OK
nRows, nCols = (678013, 12)
assert df_raw.shape == (nRows, nCols)
print(f"Correct: Shape of DataFrame is as expected: {nRows} rows, {nCols} cols")
assert df_raw.dtypes.equals(pd.Series(expected_dtypes)[df_raw.columns])
print("Correct: Data types are as expected")
assert df_raw.isna().sum().sum() == 0
print("Correct: There are no missing values in the raw dataset")

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>

## Pre-processing

In [None]:
def get_df_extra(df):
    """
    Return a copy of `df` with the response and engineered feature columns appended.

    `df` must contain the raw data columns that are read below (ClaimNb, Exposure,
    DrivAge, BonusMalus, VehAge, VehBrand, Density and Region). Any additional
    columns are carried through unchanged. The input DataFrame is not modified.

    The new columns are appended in this order: Frequency, DrivAge_capped,
    DrivAge_pow2, BonusMalus_over_50, BonusMalus_mod3, VehAge_new, VehAge_capped,
    VehBrand_grd, Density_log, Region_grd.
    """
    # Feature engineering (the results of the analysis below):
    # grouped level -> raw levels that belong to it
    brand_groups = {
        'X': ['B12', 'B14', 'B13', 'B3', 'B11', 'B4', 'B5'],
        'Y': ['B1', 'B6'],
        'Z': ['B2', 'B10'],
    }
    region_groups = {
        'W': ['R21', 'R94', 'R11', 'R42', 'R22', 'R74'],
        'X': ['R91', 'R82'],
        'Y': ['R93', 'R53'],
        'Z': ['R26', 'R25', 'R52', 'R31', 'R54', 'R73',
              'R23', 'R72', 'R83', 'R41', 'R43'],
        'A': ['R24'],
    }
    # Invert to look-up Series: raw level -> grouped level
    VehBrand_map_sers = pd.Series({
        brand: grp for grp, brands in brand_groups.items() for brand in brands
    })
    Region_map_sers = pd.Series({
        reg: grp for grp, regs in region_groups.items() for reg in regs
    })

    df_extra = df.copy()

    # Calculate frequency per year on each row
    df_extra['Frequency'] = df_extra['ClaimNb'] / df_extra['Exposure']

    # Policy holder factors
    df_extra['DrivAge_capped'] = np.clip(df_extra['DrivAge'], None, 80)
    df_extra['DrivAge_pow2'] = df_extra['DrivAge_capped'] ** 2
    df_extra['BonusMalus_over_50'] = np.where(df_extra['BonusMalus'] > 50, "Y", "N")
    # Cap at 90, then round down onto the grid 50, 53, 56, ...
    df_extra['BonusMalus_mod3'] = (
        np.floor((np.clip(df_extra['BonusMalus'], None, 90) - 48) / 3) * 3 + 50
    )

    # Vehicle factors
    df_extra['VehAge_new'] = np.where(df_extra['VehAge'] == 0, "Y", "N")
    df_extra['VehAge_capped'] = np.clip(df_extra['VehAge'], None, 18)
    df_extra['VehBrand_grd'] = VehBrand_map_sers.loc[df_extra['VehBrand']].values

    # Location factors
    # Density is clipped to the range [10, 10^4] before taking log base 10
    df_extra['Density_log'] = np.log10(np.clip(df_extra['Density'], 10, 10 ** 4))
    df_extra['Region_grd'] = Region_map_sers.loc[df_extra['Region']].values

    return df_extra

In [None]:
# Run pre-processing to get a new DataFrame
df_extra = get_df_extra(df_raw)

## Split for modelling

In [None]:
# Get index sorted with ascending IDpol, just in case it is out of order
df_all = df_extra.sort_values('IDpol').reset_index(drop=True)

# Proportions we want to split in (must sum to 1)
split_props = pd.Series({
    'train': 0.7,
    'validation': 0.15,
    'holdout': 0.15
})

# Split out training data
# (random_state is fixed so that the same split is produced on every run)
df_train, df_not_train = train_test_split(
    df_all, test_size=(1 - split_props['train']), random_state=51, shuffle=True
)
# Split remaining data between validation and holdout
# Here test_size is the holdout proportion *of the non-training rows*,
# i.e. the overall holdout proportion divided by the non-training proportion
df_validation, df_holdout = train_test_split(
    df_not_train, test_size=split_props['holdout'] / (1 - split_props['train']), random_state=13, shuffle=True
)

In [None]:
# Check all rows have been accounted for: putting the three splits back
# together (in index order) must reproduce the full data set exactly.
# Use an assert so that a failure stops the notebook rather than just
# displaying `False`.
assert pd.concat([df_train, df_validation, df_holdout]).sort_index().equals(df_all)
print("Correct: All rows are accounted for in the train, validation and holdout splits")

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>

# Modelling setup

## Setup modelling

In [None]:
# Split training into train and test
df_train_mod, df_test_mod = train_test_split(
    df_train, test_size=0.3, random_state=34, shuffle=True
)
print("Train sample size: " + str(df_train_mod.shape))

# For testing code, we'll only use a small sample.
# The object has the *same name*, so that *omitting* to run
# this line will allow the same code to run on the full sample.
# small_samp_size = int(1e4)
# _, df_train_mod = train_test_split(
#     df_train_mod, test_size=small_samp_size, random_state=90, shuffle=True
# )
# print("Small sample size: " + str(df_train_mod.shape))

In [None]:
# View first few rows
df_train_mod.head()

In [None]:
expl_var_names = [
    col_name for col_name in df_train_mod.columns.to_list() 
     if col_name not in ['IDpol', 'ClaimNb', 'Exposure', 'Frequency']
]
print("Explanatory variables\n" + '\t'.join(expl_var_names))
simple_features = expl_var_names[:9]
print("\nOf which the following are simple features\n" + '\t'.join(simple_features))

In [None]:
# Initialise DataFrame for holding model info
mods_df = pd.DataFrame(np.empty(0, dtype=np.dtype([
    ('mod_name', np.dtype('O')),
    ('descr', np.dtype('O')),
    ('GLMResults', np.dtype('O')),
])))

# Dictionary of scores on the data
scored_dfs = dict()

## Modelling notes

#### Model specification
We want to use a Poisson GLM (with log link) to model expected number of claims per year, taking account of the fact that observations have variable exposure. The relevant `statsmodels` [documentation](https://www.statsmodels.org/devel/examples/notebooks/generated/glm_weights.html#aggregated-or-averaged-data-(unique-values-of-explanatory-variables)), shows that the following are equivalent ways of specifying such a model in `sm.GLM()`:

1. With the response as the *sum* of claim counts (i.e. `ClaimNb`), and passing `exposure=data['Exposure']` (which is possible because we are using a log link).
1. With the response as the *mean* (i.e. frequency) of the claims per unit of exposure (i.e. `Frequency`), and passing `var_weights=data['Exposure']`.

Further notes showing that these are equivalent are here: <https://www.kaggle.com/btw78jt/explaining-glms>

We'll use the first option here.

## Useful functions

In [None]:
def score_data(data_df, GLMRes_obj):
    """
    Return a copy of `data_df` with actual and predicted values from a fitted model.

    data_df: DataFrame containing an `Exposure` column, the model's response
        column and the columns needed by `GLMRes_obj.predict()`.
    GLMRes_obj: Fitted results object. It must provide `model.exog_names`,
        `model.endog_names` and `predict()`.

    The following columns are appended, in this order:
        wgt:        copy of `Exposure`
        act_freq:   response divided by `wgt`
        pred_freq:  output of `GLMRes_obj.predict()`
        act_Nb:     copy of the response
        pred_Nb:    `pred_freq` multiplied by `wgt`
    """
    fitted_model = GLMRes_obj.model
    response_name = fitted_model.endog_names

    # Skip the first exog name (presumably the intercept - TODO confirm), then
    # keep the part of each name before the first '[' so that, for example,
    # 'Area[T.B]' and 'Area[T.C]' both reduce to the single entry 'Area'
    name_prefixes = pd.Series(fitted_model.exog_names[1:]).str.split(
        '[', expand=True, n=1).iloc[:, 0]
    raw_exog_names = name_prefixes.drop_duplicates().to_list()

    scored_df = data_df.copy()
    scored_df['wgt'] = scored_df['Exposure']
    scored_df['act_freq'] = scored_df[response_name] / scored_df['wgt']
    scored_df['pred_freq'] = GLMRes_obj.predict(scored_df[raw_exog_names])
    scored_df['act_Nb'] = scored_df[response_name]
    scored_df['pred_Nb'] = scored_df['pred_freq'] * scored_df['wgt']
    return scored_df

In [None]:
def get_cut_grps(df, cut_by, n_bins):
    """
    Allocate each row of `df` to a group according to the column `cut_by`.

    df: DataFrame that contains the column `cut_by`.
    cut_by: Name of the column to group on.
    n_bins: How to form the groups. One of:
        - an integer: that number of equal-width bins, as per `pd.cut()`
        - 'cat': every distinct value is its own group, i.e. the column
          is returned as it is
        - 'all': one interval per distinct (numeric) value. The interval
          edges are each distinct value plus half of the smallest gap
          between adjacent distinct values, so every interval contains
          exactly one of the distinct values. If there is only one
          distinct value, a single interval of width 1 centred on it
          is used.

    Returns: Series of group labels, with the same index as `df`.

    Raises: ValueError if `n_bins` is not one of the options above, or if
        `n_bins` is 'all' and the column has no values. (Previously an
        unrecognised `n_bins` silently returned None.)
    """
    # bool is a subclass of int, but is not a sensible number of bins
    if isinstance(n_bins, (int, np.integer)) and not isinstance(n_bins, bool):
        return pd.cut(df[cut_by], bins=n_bins)

    if isinstance(n_bins, str):
        if n_bins == 'cat':
            return df[cut_by]
        if n_bins == 'all':
            levels = np.sort(df[cut_by].unique())
            if levels.size == 0:
                raise ValueError(
                    f"Cannot cut by '{cut_by}' with n_bins='all': the column has no values"
                )
            if levels.size == 1:
                # No gap to measure, so use a unit-width interval around the value
                edges = np.array([levels[0] - 0.5, levels[0] + 0.5])
            else:
                offset = np.min(np.diff(levels)) / 2
                # Extra edge below the lowest value, mirroring the first gap
                edges = np.insert(levels, 0, 2*levels[0] - levels[1]) + offset
            return pd.cut(df[cut_by], bins=edges)

    raise ValueError(
        f"n_bins must be an integer, 'cat' or 'all', but got: {n_bins!r}"
    )

In [None]:
def get_agg_plot_data(
    data_df,
    order_by = None,
    cut_by = None,
    x_axis_var = None,
    n_bins = None,
    set_config = None
):
    """
    Aggregate scored data into groups, ready to be passed to `create_plot()`.

    data_df: Scored data. Must contain the columns `wgt`, `act_Nb` and
        `pred_Nb`, plus whichever columns are named by the other arguments.
    order_by: Column to sort the rows by before accumulating the weight.
    cut_by: Column used to allocate rows to groups. Defaults to `order_by`.
        The value 'cum_wgt' refers to the cumulative weight calculated here.
    x_axis_var: Column whose min and max in each group give the x-axis
        extent of that group. Defaults to `cut_by`.
    n_bins: Grouping method, passed to `get_cut_grps()`.
    set_config: If "lift", any of the above that are None default to:
        order_by='pred_freq', cut_by='cum_wgt', n_bins=10.

    Returns: DataFrame with one row per group and columns n_obs, wgt_sum,
        act_Nb, pred_Nb, x_min, x_max, act_av, pred_av, x_left, x_right, x_mid.
    """
    # Set defaults
    if set_config == "lift":
        order_by = 'pred_freq' if order_by is None else order_by
        cut_by = 'cum_wgt' if cut_by is None else cut_by
        x_axis_var = cut_by if x_axis_var is None else x_axis_var
        n_bins = 10 if n_bins is None else n_bins
    cut_by = order_by if cut_by is None else cut_by
    x_axis_var = cut_by if x_axis_var is None else x_axis_var

    # Sort by the ordering variable (ties are broken by the original index)
    ordered_df = data_df.rename_axis(index='index').sort_values([order_by, 'index'])

    # Accumulate the weight, and allocate each row to a group.
    # cum_wgt is the largest cumulative weight within each value of order_by,
    # so that rows with equal order_by values share the same cum_wgt.
    ordered_df = ordered_df.assign(
        cum_wgt_raw=lambda x: x.wgt.cumsum(),
        cum_wgt=lambda x: x.groupby(order_by).cum_wgt_raw.transform('max'),
        grp=lambda x: get_cut_grps(x, cut_by, n_bins),
    )

    # Aggregate to one row per group
    grouped_df = ordered_df.groupby('grp', sort=False).agg(
        n_obs=('grp', 'size'),
        wgt_sum=('wgt', 'sum'),
        act_Nb=('act_Nb', 'sum'),
        pred_Nb=('pred_Nb', 'sum'),
        x_min=(x_axis_var, 'min'),
        x_max=(x_axis_var, 'max'),
    )

    # Index the groups by intervals
    if n_bins == 'cat':
        # Categories get unit-width intervals centred on 0, 1, 2, ...
        grouped_df = grouped_df.reset_index(drop=True)
        grouped_df = grouped_df.set_index(
            pd.interval_range(start=-0.5, periods=grouped_df.shape[0], freq=1.)
        )
    else:
        grouped_df = grouped_df.set_index(grouped_df.index.categories)

    # Averages per unit of weight
    plt_data_df = grouped_df.assign(
        act_av=lambda x: x.act_Nb / x.wgt_sum,
        pred_av=lambda x: x.pred_Nb / x.wgt_sum,
    )

    # Horizontal extent of each group: where the group covers a single x value,
    # fall back to the edges of the group's interval
    is_single_x = plt_data_df.x_min == plt_data_df.x_max
    plt_data_df = plt_data_df.assign(
        x_left=np.select(
            [is_single_x], [plt_data_df.index.left], default=plt_data_df.x_min
        ),
        x_right=np.select(
            [is_single_x], [plt_data_df.index.right], default=plt_data_df.x_max
        ),
    )
    plt_data_df = plt_data_df.assign(
        x_mid=lambda x: (x.x_right + x.x_left) / 2,
    )
    return plt_data_df

In [None]:
def create_plot(plt_data_df, n_bins = None):
    """
    Create a Bokeh chart of predicted vs actual averages over an exposure histogram.

    plt_data_df: Aggregated data with the columns wgt_sum, x_left, x_right,
        x_mid, x_min, act_av and pred_av (as produced by `get_agg_plot_data()`).
    n_bins: If "cat", the x-axis ticks are placed at 0, 1, 2, ... and
        labelled with the `x_min` value of each row.

    Returns: The Bokeh figure (it is not shown).
    """
    y_range2_name = 'y_range2_name'
    bar_heights = plt_data_df.wgt_sum
    averages_df = plt_data_df[['act_av', 'pred_av']]

    bkplt = figure(
        title="Predicted vs Actual chart",
        x_axis_label='Pred val',
        y_axis_label="Exposure",
        tools="reset,box_zoom,pan,wheel_zoom,save",
        background_fill_color="#fafafa",
        plot_width=800,
        plot_height=500,
    )

    # Histogram of the weight, on the left axis.
    # The bars only take up the lower half of the plot area.
    bkplt.quad(
        top=bar_heights,
        bottom=0,
        left=plt_data_df.x_left,
        right=plt_data_df.x_right,
        fill_color="khaki",
        line_color="white",
        legend_label="Exposure",
    )
    bkplt.y_range = Range1d(0, bar_heights.max() / 0.5)

    # Averages go on a second axis, on the right
    bkplt.extra_y_ranges[y_range2_name] = Range1d(
        averages_df.min().min(),
        averages_df.max().max() / 0.9,
    )
    bkplt.add_layout(
        LinearAxis(y_range_name=y_range2_name, axis_label="Average response"),
        'right',
    )

    # Each series is drawn as points joined by a line
    for col_name, series_color in (('pred_av', 'purple'), ('act_av', 'green')):
        shared_kwargs = dict(
            color=series_color,
            y_range_name=y_range2_name,
            legend_label=col_name,
        )
        bkplt.circle(plt_data_df.x_mid, plt_data_df[col_name], size=4, **shared_kwargs)
        bkplt.line(plt_data_df.x_mid, plt_data_df[col_name], **shared_kwargs)

    bkplt.grid.grid_line_color = "white"
    bkplt.legend.location = "top_left"
    bkplt.legend.click_policy = "hide"

    if n_bins == "cat":
        # Row position -> label to show at that tick
        x_tick_labs = plt_data_df.x_min.astype(str).reset_index(drop=True).to_dict()
        bkplt.xaxis.ticker = list(x_tick_labs.keys())
        bkplt.xaxis.major_label_overrides = x_tick_labs

    return bkplt

In [None]:
def get_cat_map_ordered(df, cat_col_name, order_by='act_av_freq'):
    """
    Summarise a categorical column by level, sorted in descending order.

    df: DataFrame that contains `cat_col_name`, `Exposure` and `ClaimNb`.
    cat_col_name: Name of the categorical column to summarise.
    order_by: Summary column to sort by (largest first).

    Returns: DataFrame indexed by the levels of `cat_col_name`, with columns:
        n_obs:        number of rows
        wgt_sum:      sum of `Exposure`
        act_Nb:       sum of `ClaimNb`
        act_av_freq:  act_Nb / wgt_sum
        all_levels:   position of the level in the sorted order (from 0)
    """
    level_summary = df.groupby(cat_col_name).agg(
        n_obs=(cat_col_name, 'size'),
        wgt_sum=('Exposure', 'sum'),
        act_Nb=('ClaimNb', 'sum'),
    )
    level_summary = level_summary.assign(
        act_av_freq=level_summary['act_Nb'] / level_summary['wgt_sum']
    )

    ranked = level_summary.sort_values([order_by], ascending=False)
    ranked = ranked.assign(all_levels=pd.RangeIndex(ranked.shape[0]))
    return ranked

## Mean model
Just for checking that the code is working for the simplest case.

In [None]:
%%time
GLMRes_obj = smf.glm(
    "ClaimNb ~ 1",
    data=df_train_mod, exposure=np.asarray(df_train_mod['Exposure']),
    family=sm.families.Poisson(sm.genmod.families.links.log()),
).fit()
print(GLMRes_obj.summary())

In [None]:
# Check that this is the mean model
mean_mod_pred = np.exp(GLMRes_obj.params[0])
assert np.abs(
    GLMRes_obj.family.link.inverse(GLMRes_obj.params[0]) - 
    GLMRes_obj.predict(pd.DataFrame([1]))[0]
) < 1e-10
assert np.abs(
    df_train_mod.ClaimNb.sum() / df_train_mod.Exposure.sum() - 
    mean_mod_pred
) < 1e-10
print("Correct: Reasonableness tests have passed")

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>

# Simple features model

## Fit and score

In [None]:
%%time
# Takes approx 20 secs
mods_df.loc[0, ['mod_name', 'descr']] = [
    'All_simple_features', 
    'All simple features'
]
mods_df.GLMResults[0] = smf.glm(
    "ClaimNb ~ " +  ' + '.join(simple_features),
    data=df_train_mod, exposure=np.asarray(df_train_mod['Exposure']),
    family=sm.families.Poisson(sm.genmod.families.links.log()),
).fit()
print(mods_df.GLMResults[0].summary())

In [None]:
%%time
# Score all the training data for analysis
# Takes under 10 secs
scored_dfs[0] = score_data(df_train, mods_df.GLMResults[0])

In [None]:
# Reasonableness checks
assert np.abs(scored_dfs[0].act_Nb.sum() - df_train.ClaimNb.sum()) < 1e-7
assert np.abs(scored_dfs[0].wgt.sum() - df_train.Exposure.sum()) < 1e-7
print("Correct: Reasonableness checks pass\n")
print(f"Predicted number of claims:\t{scored_dfs[0].pred_Nb.sum():,.1f}")
print(f"Actual number of claims:\t{scored_dfs[0].ClaimNb.sum():,.1f}")
print(f"Difference:\t\t\t{scored_dfs[0].pred_Nb.sum() - scored_dfs[0].ClaimNb.sum():,.1f}")

## Visualise fit

In [None]:
# Sample data for visualisations
df_extra_for_plt = scored_dfs[0].loc[
    # Data not used for training
    ~scored_dfs[0].index.isin(df_train_mod.index), :
#].iloc[:int(1e5), :  # Limit the number of rows for the plot
].copy()

In [None]:
# Checks
print("Number of rows")
print(
    "Scored minus train_mod size:\t" + 
    str(scored_dfs[0].shape[0] - df_train_mod.shape[0])
)
print("Test_mod size:\t\t\t" + str(df_test_mod.shape[0]))
print("Data for plots size:\t\t" + str(df_extra_for_plt.shape[0]))

### Lift plot

In [None]:
lift_plt_data_df = get_agg_plot_data(df_extra_for_plt, set_config = "lift")
lift_plt = create_plot(lift_plt_data_df)
show(lift_plt)

In [None]:
lift_actuals = lift_plt_data_df.act_av.iloc[-1] / lift_plt_data_df.act_av.iloc[0]
lift_pred = lift_plt_data_df.pred_av.iloc[-1] / lift_plt_data_df.pred_av.iloc[0]
print(f'Lift on actuals:\t{lift_actuals:.3f}')
print(f'Lift on predicted:\t{lift_pred:.3f}')

### Individual factors
In the following, I interactively examined each factor. Only the final plot is shown, and I've recorded some decisions to refine the feature engineering.

In [None]:
# Ordering and bucketing categorical variables
Area_map_df = get_cat_map_ordered(df_extra_for_plt, 'Area')
Region_map_df = get_cat_map_ordered(df_extra_for_plt, 'Region')
VehBrand_map_df = get_cat_map_ordered(df_extra_for_plt, 'VehBrand')

### Policy holder factors

#### DrivAge
- Cap at high values due to low exposure.
- Not doing a good job at low ages (under 25). Try adding a quadratic term to the model.

In [None]:
indiv_plt_data_df = get_agg_plot_data(
    df_extra_for_plt.assign(
        DrivAge_capped=lambda x: np.clip(x.DrivAge, None, 80)
    ),
    order_by = 'DrivAge_capped', #cut_by = 'cum_wgt', x_axis_var = 'DrivAge',
    n_bins = 'all'
)
indiv_plt = create_plot(indiv_plt_data_df)
show(indiv_plt)

#### BonusMalus
- Majority of weight is on one value (50). And predictions are not extreme enough on this split. Add a binary variable. 
- Long tail - cap at around 90.
- Other than that, values of multiple of 3 seem to be more popular. Possibly an increasing trend in that section.

In [None]:
n_bins = 'all'
indiv_plt_data_df = get_agg_plot_data(
    df_extra_for_plt.query("BonusMalus > 50").assign(
        BonusMalus_over_50=lambda x: np.select([x.BonusMalus > 50], ["Y"], default="N"),
        BonusMalus_capped=lambda x: np.clip(x.BonusMalus, None, 90),
        BonusMalus_mod3=lambda x: np.floor((x.BonusMalus_capped - 48)/3)*3 + 50,
    ),
    order_by = 'BonusMalus_mod3', #cut_by = 'cum_wgt', x_axis_var = 'BonusMalus',
    n_bins = n_bins
)
indiv_plt = create_plot(indiv_plt_data_df, n_bins=n_bins)
show(indiv_plt)

### Vehicle factors

#### VehAge
- Cap at high values due to low exposure.
- Not doing a good job for the lowest age. Arguably flat after this value. Add a binary factor (although there isn't much exposure here).

In [None]:
indiv_plt_data_df = get_agg_plot_data(
    df_extra_for_plt.assign(
        VehAge_new=lambda x: np.select([x.VehAge == 0], ["Y"], default="N"),
        VehAge_capped=lambda x: np.clip(x.VehAge, None, 18),
    ),
    order_by = 'VehAge_capped', #cut_by = 'cum_wgt', x_axis_var = 'DrivAge',
    n_bins = 'all'
)
indiv_plt = create_plot(indiv_plt_data_df)
show(indiv_plt)

#### VehBrand
Some levels have low exposure. It is, however, difficult to group them into levels that are reasonably equally weighted while keeping a specified order (e.g. by actual frequency). Ended up going for 3 levels.

In [None]:
n_grps = 3
VehBrand_map_df = VehBrand_map_df.assign(
    cum_wgt=lambda x: x.wgt_sum.cumsum(),
    grps=lambda x: pd.cut(
        x.cum_wgt, bins=n_grps, 
        labels=[letter for letter in string.ascii_uppercase[(-n_grps):]]
    ).astype(str)
)

n_bins = 'cat'
indiv_plt_data_df = get_agg_plot_data(
    df_extra_for_plt.assign(
        VehBrand_ord=lambda x: VehBrand_map_df.all_levels.loc[x.VehBrand].values,
        VehBrand_grd=lambda x: VehBrand_map_df.grps.loc[x.VehBrand].values,
    ),
    order_by = 'VehBrand_grd', #cut_by = 'VehBrand_grd', #x_axis_var = 'BonusMalus',
    n_bins = n_bins
)
indiv_plt = create_plot(indiv_plt_data_df, n_bins=n_bins)
show(indiv_plt)

In [None]:
# Print out groups to use
print("{" + ", ".join([
    f"'{grp}': '{grp_code}'" for grp, grp_code 
    in VehBrand_map_df.grps.to_dict().items()
]) + "}")

In [None]:
# Check against the hard-coded feature
# Any difference should be small (e.g. in the smallest levels by wgt)
VehBrand_map_df.merge(
    df_extra_for_plt[[
        'VehBrand', 'VehBrand_grd']].drop_duplicates(
        ).set_index('VehBrand'),
    how="outer", left_index=True, right_index=True
).assign(
    same_allocation=lambda x: x.VehBrand_grd == x.grps
).sort_values('all_levels')

#### VehGas
The factor only has two levels, but the model is not matching well (although the range is small). Suggests trying interaction with `VehBrand`, but I'm not going to try interactions at this stage.

In [None]:
n_bins = 'cat'
indiv_plt_data_df = get_agg_plot_data(
    df_extra_for_plt,
    order_by = 'VehGas', #cut_by = 'VehBrand_grd', #x_axis_var = 'BonusMalus',
    n_bins = n_bins
)
indiv_plt = create_plot(indiv_plt_data_df, n_bins=n_bins)
show(indiv_plt)

### Location factors

#### Area
Seems to be a trend in the actuals, and the buckets are reasonably well spread. Leave it as is.

In [None]:
n_bins = 'cat'
indiv_plt_data_df = get_agg_plot_data(
    df_extra_for_plt.assign(
        Area_ord=lambda x: Area_map_df.all_levels.loc[x.Area].values,
    ),
    order_by = 'Area_ord', #cut_by = 'VehBrand_grd', #x_axis_var = 'BonusMalus',
    n_bins = n_bins
)
indiv_plt = create_plot(indiv_plt_data_df, n_bins=n_bins)
show(indiv_plt)

#### Density
A very wide range of values with a long tail. Try logging and clipping the extremes. Although the predicted values follow the actuals pretty well.

In [None]:
n_bins = 20
indiv_plt_data_df = get_agg_plot_data(
    df_extra_for_plt.assign(
        Density_log=lambda x: np.log10(np.clip(x.Density, 10, np.power(10, 4))),
    ),
    order_by = 'Density_log', #cut_by = 'cum_wgt', x_axis_var = 'Density_log',
    n_bins = n_bins
)
indiv_plt = create_plot(indiv_plt_data_df, n_bins=n_bins)
show(indiv_plt)

#### Region
Uneven distribution over regions suggests some grouping. This seems to be easier than for `VehBrand` - settled on 4 groups as a reasonable number.

In [None]:
n_grps = 4
Region_map_df = Region_map_df.assign(
    cum_wgt=lambda x: x.wgt_sum.cumsum(),
    grps=lambda x: pd.cut(
        x.cum_wgt, bins=n_grps, 
        labels=[letter for letter in string.ascii_uppercase[(-n_grps):]]
    ).astype(str),
    manual_grps=pd.Series({
        **{reg: 'W' for reg in ['R21', 'R94', 'R11', 'R42', 'R22', 'R74']},
        **{reg: 'X' for reg in ['R91', 'R82']},
        **{reg: 'Y' for reg in ['R93', 'R53']},
        **{reg: 'Z' for reg in ['R26', 'R25', 'R52', 'R31', 'R54', 'R73', 
                                'R23', 'R72', 'R83', 'R41', 'R43']},
        **{reg: 'A' for reg in ['R24']},
    })
)

n_bins = 'cat'
indiv_plt_data_df = get_agg_plot_data(
    df_extra_for_plt.assign(
        Region_ord=lambda x: Region_map_df.all_levels.loc[x.Region].values,
        Region_grd=lambda x: Region_map_df.manual_grps.loc[x.Region].values,
    ),
    order_by = 'Region_grd', #cut_by = 'VehBrand_grd', #x_axis_var = 'BonusMalus',
    n_bins = n_bins
)
indiv_plt = create_plot(indiv_plt_data_df, n_bins=n_bins)
show(indiv_plt)

In [None]:
# Print out groups to use
print('\n'.join([
    f"**{{reg: '{code}' for reg in {grps}}}," for
    code, grps in Region_map_df.reset_index().groupby('grps').agg({
        'Region': lambda x: [reg for reg in x]
    }).Region.to_dict().items()
]))

In [None]:
# Check against the hard-coded feature
# Any difference should be small (e.g. in the smallest levels by wgt)
Region_map_df.merge(
    df_extra_for_plt[[
        'Region', 'Region_grd']].drop_duplicates(
        ).set_index('Region'),
    how="outer", left_index=True, right_index=True
).assign(
    same_allocation=lambda x: x.Region_grd == x.manual_grps
).sort_values('all_levels').style.bar(subset=['wgt_sum', 'act_av_freq'], color='#d65f5f')

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>

# Feature selection
Try selecting simple factors in stepwise regression using AIC. We want to *minimise* the AIC.

In [None]:
# The modelling is quite intensive on memory,
# so we need to limit the amount of data used
_, data_swreg_df = train_test_split(
    df_train_mod, test_size=0.5, random_state=98, shuffle=True
)

In [None]:
def fit_candidate(
    selected, candidate, 
    n_step, n_iter, # Adds these values to the model list
    mods_swreg_df, # Add results to this model list
    data_swreg_df, # Use this data set
    direction = None, verbose = True
):
    """
    Fit a Poisson GLM with one 'candidate' factor added or removed, and record it.

    selected: List of factor names currently in the model. It is not modified.
    candidate: Name of the factor to add (forward) or remove (backward).
        The string "None" means fit the `selected` factors as they are.
    n_step, n_iter: Step number, and the row label in `mods_swreg_df` at
        which the result is recorded.
    mods_swreg_df: DataFrame of models fitted so far. Row `n_iter` is written
        to it in place (columns step, candidate, rhs_str, aic, df_model and
        GLMResults).
    data_swreg_df: Data to fit on. Must contain `ClaimNb`, `Exposure` and
        all of the factors.
    direction: 'fwd' (the default, if None) adds the candidate to `selected`.
        Any other value removes the candidate from `selected`.
    verbose: Whether to print the AIC of the fitted model.

    Returns: `mods_swreg_df` (the same object that was passed in).

    Raises: ValueError if removing a candidate that is not in `selected`.
    """
    if direction is None:
        direction = 'fwd'

    # Compare to the "None" placeholder by value (==), not by identity (is).
    # Identity comparison of strings only works when CPython happens to
    # intern both of them, and gives a SyntaxWarning from Python 3.8.
    has_candidate = candidate != "None"

    # Work on a copy so that the caller's list is never modified
    factors_ls = list(selected)
    if has_candidate:
        if direction == 'fwd':
            factors_ls.append(candidate)
        else:
            factors_ls.remove(candidate)

    mods_swreg_df.loc[n_iter, ['step', 'candidate', 'rhs_str']] = [
        n_step,
        candidate,
        ' + '.join(['1'] + factors_ls)
    ]

    GLMRes_tmp = smf.glm(
        "ClaimNb ~ " + mods_swreg_df.loc[n_iter, 'rhs_str'],
        data=data_swreg_df,
        exposure=np.asarray(data_swreg_df['Exposure']),
        family=sm.families.Poisson(sm.genmod.families.links.log()),
    ).fit()

    mods_swreg_df.loc[n_iter, 'aic'] = GLMRes_tmp.aic
    mods_swreg_df.loc[n_iter, 'df_model'] = GLMRes_tmp.df_model
    GLMRes_tmp.remove_data() # Clear up to save RAM
    mods_swreg_df.loc[n_iter, 'GLMResults'] = GLMRes_tmp

    if verbose:
        print(
            f"Iter: {n_iter:<2}\t"
            f"Candidate: {candidate:<10}\t"
            f"AIC: {mods_swreg_df.loc[n_iter, 'aic']:.1f}"
        )

    return mods_swreg_df

In [None]:
# Initialise DataFrame for holding model info
mods_swreg_df = pd.DataFrame(np.empty(int(0), dtype=np.dtype([
    ('step', np.dtype('int64')),
    ('candidate', np.dtype('O')),
    ('rhs_str', np.dtype('O')),
    ('GLMResults', np.dtype('O')),
    ('df_model', np.dtype('int64')),
    ('aic', np.dtype('float64')),
])))

n_iter = 0
n_step = 0
remaining = simple_features.copy()
direction = 'fwd'
if direction == 'fwd':
    selected = []
else:
    selected = remaining.copy()
go_to_next_step = True

In [None]:
%%time

# Set to False to skip this (slow) cell, e.g. when re-running the notebook
run_this_chunk = True
if run_this_chunk:

    # The below loop does not cover the initial model
    # Takes approx 4 mins
    # (candidate="None" means: fit the starting `selected` factors as they are)
    mods_swreg_df = fit_candidate(
        selected=selected, candidate="None", n_step=n_step, n_iter=n_iter,
        mods_swreg_df=mods_swreg_df, data_swreg_df=data_swreg_df,
        direction=direction, verbose=True
    )

    # Each pass of this loop is one step of the stepwise regression.
    # It stops when there are no factors left to try, or when the
    # previous step did not improve on the best AIC so far.
    while remaining and go_to_next_step:
        n_step += 1
        go_to_next_step = False
        print(
            f"==== Step {n_step:02} ====\n"
            f"Already selected: {', '.join(selected)}\n"
            f"Remaining: {', '.join(remaining)}"
        )

        # Fit one model for each factor that is still available
        for candidate in remaining:
            n_iter += 1
            mods_swreg_df = fit_candidate(
                selected, candidate, n_step, n_iter,
                mods_swreg_df, data_swreg_df,
                direction=direction, verbose=True
            )

        # Sort so that, within each step, the lowest AIC comes first.
        # The index is reset to 0, 1, 2, ... so that the next `n_iter`
        # is still a new row label.
        mods_swreg_df = mods_swreg_df.sort_values(
            ['step', 'aic', 'rhs_str']).reset_index(drop=True)
        best_aic_so_far = mods_swreg_df.aic.min()
        best_in_step = mods_swreg_df.query(f"step == {n_step}")[['candidate', 'aic']].iloc[0]

        # Only accept the best candidate of this step if it also has
        # the lowest AIC of all the models fitted so far
        if best_in_step.aic == best_aic_so_far:
            if direction == 'fwd':
                selected.append(best_in_step.candidate)
            else:
                selected.remove(best_in_step.candidate)
            remaining.remove(best_in_step.candidate)
            print(
                f"Step {n_step:02} "
                f"selected: {best_in_step.candidate:<10} "
                f"(AIC: {best_in_step.aic:.1f})"
            )
            go_to_next_step = True
        else:
            print(
                f"Step {n_step:02} "
                f"selected: None\n"
                f"==== Stepwise regression complete ===="
            )

In [None]:
# Save and view results
if run_this_chunk:
    mods_fw_swreg_df = mods_swreg_df.copy()
    best_mod = mods_fw_swreg_df.query("aic == @mods_fw_swreg_df.aic.min()").iloc[0]
    print("Best model for forward regression:")
    print("-" * 40)
    print(
        "Formula:\t" + best_mod.rhs_str + "\n"
        f"aic:\t\t{best_mod.aic:.2f}\n"
        f"df_model:\t{int(best_mod.df_model)}"
    )

In [None]:
# Initialise DataFrame for holding model info
mods_swreg_df = pd.DataFrame(np.empty(int(0), dtype=np.dtype([
    ('step', np.dtype('int64')),
    ('candidate', np.dtype('O')),
    ('rhs_str', np.dtype('O')),
    ('GLMResults', np.dtype('O')),
    ('df_model', np.dtype('int64')),
    ('aic', np.dtype('float64')),
])))

n_iter = 0
n_step = 0
remaining = simple_features.copy()
direction = 'bwd'
if direction == 'fwd':
    selected = []
else:
    selected = remaining.copy()
go_to_next_step = True

In [None]:
%%time

# Set to False to skip this (slow) cell, e.g. when re-running the notebook
run_this_chunk = True
if run_this_chunk:

    # The below loop does not cover the initial model
    # Takes approx 5 mins
    # (candidate="None" means: fit the starting `selected` factors as they are)
    mods_swreg_df = fit_candidate(
        selected=selected, candidate="None", n_step=n_step, n_iter=n_iter,
        mods_swreg_df=mods_swreg_df, data_swreg_df=data_swreg_df,
        direction=direction, verbose=True
    )

    # Each pass of this loop is one step of the stepwise regression.
    # It stops when there are no factors left to try, or when the
    # previous step did not improve on the best AIC so far.
    while remaining and go_to_next_step:
        n_step += 1
        go_to_next_step = False
        print(
            f"==== Step {n_step:02} ====\n"
            f"Already selected: {', '.join(selected)}\n"
            f"Remaining: {', '.join(remaining)}"
        )

        # Fit one model for each factor that is still available
        # (with direction 'bwd', the candidate is the factor that is removed)
        for candidate in remaining:
            n_iter += 1
            mods_swreg_df = fit_candidate(
                selected, candidate, n_step, n_iter,
                mods_swreg_df, data_swreg_df,
                direction=direction, verbose=True
            )

        # Sort so that, within each step, the lowest AIC comes first.
        # The index is reset to 0, 1, 2, ... so that the next `n_iter`
        # is still a new row label.
        mods_swreg_df = mods_swreg_df.sort_values(
            ['step', 'aic', 'rhs_str']).reset_index(drop=True)
        best_aic_so_far = mods_swreg_df.aic.min()
        best_in_step = mods_swreg_df.query(f"step == {n_step}")[['candidate', 'aic']].iloc[0]

        # Only accept the best candidate of this step if it also has
        # the lowest AIC of all the models fitted so far
        if best_in_step.aic == best_aic_so_far:
            if direction == 'fwd':
                selected.append(best_in_step.candidate)
            else:
                selected.remove(best_in_step.candidate)
            remaining.remove(best_in_step.candidate)
            print(
                f"Step {n_step:02} "
                f"selected: {best_in_step.candidate:<10} "
                f"(AIC: {best_in_step.aic:.1f})"
            )
            go_to_next_step = True
        else:
            print(
                f"Step {n_step:02} "
                f"selected: None\n"
                f"==== Stepwise regression complete ===="
            )

In [None]:
# Keep a copy of the stepwise results table and report its lowest-AIC model
if run_this_chunk:
    mods_bw_swreg_df = mods_swreg_df.copy()
    # First row whose AIC equals the minimum AIC in the table
    best_mod = mods_bw_swreg_df.loc[
        mods_bw_swreg_df.aic == mods_bw_swreg_df.aic.min()
    ].iloc[0]
    print("Best model for backward regression:")
    print("-" * 40)
    print(
        f"Formula:\t{best_mod.rhs_str}\n"
        f"aic:\t\t{best_mod.aic:.2f}\n"
        f"df_model:\t{int(best_mod.df_model)}"
    )

### Add selected model to list

In [None]:
%%time
# Takes approx 20 secs
mods_df.loc[1, ['mod_name', 'descr']] = [
    'Selected simple features', 
    'Following forward and backward stepwise regression'
]
mods_df.GLMResults[1] = smf.glm(
    "ClaimNb ~ 1 + " + " + ".join(  # <<< Manually specify the factors
        [fac for fac in simple_features if not fac in ["Density", "VehGas"]]),
    data=df_train_mod, exposure=np.asarray(df_train_mod['Exposure']),
    family=sm.families.Poisson(sm.genmod.families.links.log()),
).fit()
print(mods_df.GLMResults[1].summary())

In [None]:
# Drop the references to the stepwise results tables that are no longer needed
mods_bw_swreg_df = mods_fw_swreg_df = mods_swreg_df = None

In [None]:
%%time
# Score data for analysis
# Takes under 10 secs
scored_dfs[1] = score_data(df_train, mods_df.GLMResults[1])

In [None]:
# Reasonableness checks: scoring must preserve the claim and exposure totals
assert abs(scored_dfs[1]['act_Nb'].sum() - df_train['ClaimNb'].sum()) < 1e-7
assert abs(scored_dfs[1]['wgt'].sum() - df_train['Exposure'].sum()) < 1e-7
print("Correct: Reasonableness checks pass\n")
# Compare total predicted claims against total actual claims
print(f"Predicted number of claims:\t{scored_dfs[1]['pred_Nb'].sum():,.1f}")
print(f"Actual number of claims:\t{scored_dfs[1]['ClaimNb'].sum():,.1f}")
print(f"Difference:\t\t\t{scored_dfs[1]['pred_Nb'].sum() - scored_dfs[1]['ClaimNb'].sum():,.1f}")

### Engineered features
Consider adding these in one at a time. Can we improve AIC?

In [None]:
# Preview the engineered features: the columns to the right of 'Frequency'
df_train_mod.head().loc[:, 'Frequency':].iloc[:, 1:]

In [None]:
# Candidate factors: the engineered features, plus the simple features that
# are neither superseded by an engineered version nor deliberately excluded
remaining = [
    'DrivAge_capped', 'DrivAge_pow2',
    'BonusMalus_over_50', 'BonusMalus_mod3',
    'VehAge_new', 'VehAge_capped',
    'VehBrand_grd',
    'Density_log',
    'Region_grd',
] + [
    fac for fac in simple_features
    if fac not in [
        # Superseded by their engineered versions
        "DrivAge", "BonusMalus", "VehAge",
        "VehBrand", "Region", "Density",
        # Not included at all
        "VehGas",
    ]
]

# Zero-row table with typed columns for holding model info
mods_swreg_df = pd.DataFrame(np.empty(0, dtype=[
    ('step', 'int64'),
    ('candidate', 'O'),
    ('rhs_str', 'O'),
    ('GLMResults', 'O'),
    ('df_model', 'int64'),
    ('aic', 'float64'),
]))

# Counters and state used by the stepwise loop
n_iter = 0
n_step = 0
direction = 'fwd'
# Forward starts with no factors selected; otherwise start from all of them
selected = [] if direction == 'fwd' else remaining.copy()
go_to_next_step = True

In [None]:
%%time

# Switch guarding the slow stepwise run below
run_this_chunk = True
if run_this_chunk:
    # Takes approx 4 mins
    # The below loop does not cover the initial model, so it is fitted here
    # at step 0 with the placeholder candidate "None".
    # NOTE(review): fit_candidate is defined elsewhere; it presumably fits the
    # model implied by `selected` and `candidate` and returns mods_swreg_df
    # with that model's row added -- confirm against its definition.
    mods_swreg_df = fit_candidate(
        selected=selected, 
        candidate="None", n_step=n_step, n_iter=n_iter,
        mods_swreg_df=mods_swreg_df, data_swreg_df=data_swreg_df,
        direction=direction, verbose=True
    )

    # Keep stepping while candidates remain and the previous step chose one
    while remaining and go_to_next_step:
        n_step += 1
        # Reset; set back to True only if this step selects a candidate
        go_to_next_step = False
        print(
            f"==== Step {n_step:02} ====\n"
            f"Already selected: {', '.join(selected)}\n"
            f"Remaining: {', '.join(remaining)}"
        )

        # Try every remaining candidate in turn at this step
        for candidate in remaining:
            n_iter += 1
            mods_swreg_df = fit_candidate(
                selected, candidate, n_step, n_iter,
                mods_swreg_df, data_swreg_df,
                direction=direction, verbose=True
            )

        # Sort so that, within each step, the lowest-AIC model comes first
        mods_swreg_df = mods_swreg_df.sort_values(
            ['step', 'aic', 'rhs_str']).reset_index(drop=True)
        # Lowest AIC across all rows recorded so far (every step)
        best_aic_so_far = mods_swreg_df.aic.min()
        # First row of the current step, i.e. its lowest AIC given the sort
        best_in_step = mods_swreg_df.query(f"step == {n_step}")[['candidate', 'aic']].iloc[0]

        # Accept the step's best candidate only if it attains the best AIC so far
        if best_in_step.aic == best_aic_so_far:
            if direction == 'fwd':
                selected.append(best_in_step.candidate)
            else:
                selected.remove(best_in_step.candidate)
            # Either way it is no longer a candidate for later steps
            remaining.remove(best_in_step.candidate)
            print(
                f"Step {n_step:02} "
                f"selected: {best_in_step.candidate:<10} "
                f"(AIC: {best_in_step.aic:.1f})"
            )
            go_to_next_step = True
        else:
            # No candidate matched the best AIC: the flag stays False, ending the loop
            print(
                f"Step {n_step:02} "
                f"selected: None\n"
                f"==== Stepwise regression complete ===="
            )

In [None]:
# Keep a copy of the stepwise results table and report its lowest-AIC model
if run_this_chunk:
    mods_fw_swreg_extra_df = mods_swreg_df.copy()
    # First row whose AIC equals the minimum AIC in the table
    best_mod = mods_fw_swreg_extra_df.loc[
        mods_fw_swreg_extra_df.aic == mods_fw_swreg_extra_df.aic.min()
    ].iloc[0]
    print("Best model for forward regression:")
    print("-" * 40)
    print(
        f"Formula:\t{best_mod.rhs_str}\n"
        f"aic:\t\t{best_mod.aic:.2f}\n"
        f"df_model:\t{int(best_mod.df_model)}"
    )

In [None]:
# Repeat the selection in the backward direction, starting from every factor
remaining = [
    'DrivAge_capped', 'DrivAge_pow2',
    'BonusMalus_over_50', 'BonusMalus_mod3',
    'VehAge_new', 'VehAge_capped',
    'VehBrand_grd',
    'Density_log',
    'Region_grd',
] + [
    fac for fac in simple_features
    if fac not in [
        # Superseded by their engineered versions
        "DrivAge", "BonusMalus", "VehAge",
        "VehBrand", "Region", "Density",
        # Not included at all
        "VehGas",
    ]
]

# Zero-row table with typed columns for holding model info
mods_swreg_df = pd.DataFrame(np.empty(0, dtype=[
    ('step', 'int64'),
    ('candidate', 'O'),
    ('rhs_str', 'O'),
    ('GLMResults', 'O'),
    ('df_model', 'int64'),
    ('aic', 'float64'),
]))

# Counters and state used by the stepwise loop
n_iter = 0
n_step = 0
direction = 'bwd'  # <<<<<<<<< Direction of stepwise regression
# Forward starts with no factors selected; otherwise start from all of them
selected = [] if direction == 'fwd' else remaining.copy()
go_to_next_step = True

In [None]:
%%time

# Switch guarding the slow stepwise run below
run_this_chunk = True
if run_this_chunk:
    # The below loop does not cover the initial model, so it is fitted here
    # at step 0 with the placeholder candidate "None".
    # Takes approx 2 mins
    # NOTE(review): fit_candidate is defined elsewhere; it presumably fits the
    # model implied by `selected` and `candidate` and returns mods_swreg_df
    # with that model's row added -- confirm against its definition.
    mods_swreg_df = fit_candidate(
        selected=selected, 
        candidate="None", n_step=n_step, n_iter=n_iter,
        mods_swreg_df=mods_swreg_df, data_swreg_df=data_swreg_df,
        direction=direction, verbose=True
    )

    # Keep stepping while candidates remain and the previous step chose one
    while remaining and go_to_next_step:
        n_step += 1
        # Reset; set back to True only if this step selects a candidate
        go_to_next_step = False
        print(
            f"==== Step {n_step:02} ====\n"
            f"Already selected: {', '.join(selected)}\n"
            f"Remaining: {', '.join(remaining)}"
        )

        # Try every remaining candidate in turn at this step
        for candidate in remaining:
            n_iter += 1
            mods_swreg_df = fit_candidate(
                selected, candidate, n_step, n_iter,
                mods_swreg_df, data_swreg_df,
                direction=direction, verbose=True
            )

        # Sort so that, within each step, the lowest-AIC model comes first
        mods_swreg_df = mods_swreg_df.sort_values(
            ['step', 'aic', 'rhs_str']).reset_index(drop=True)
        # Lowest AIC across all rows recorded so far (every step)
        best_aic_so_far = mods_swreg_df.aic.min()
        # First row of the current step, i.e. its lowest AIC given the sort
        best_in_step = mods_swreg_df.query(f"step == {n_step}")[['candidate', 'aic']].iloc[0]

        # Accept the step's best candidate only if it attains the best AIC so far
        if best_in_step.aic == best_aic_so_far:
            if direction == 'fwd':
                selected.append(best_in_step.candidate)
            else:
                selected.remove(best_in_step.candidate)
            # Either way it is no longer a candidate for later steps
            remaining.remove(best_in_step.candidate)
            print(
                f"Step {n_step:02} "
                f"selected: {best_in_step.candidate:<10} "
                f"(AIC: {best_in_step.aic:.1f})"
            )
            go_to_next_step = True
        else:
            # No candidate matched the best AIC: the flag stays False, ending the loop
            print(
                f"Step {n_step:02} "
                f"selected: None\n"
                f"==== Stepwise regression complete ===="
            )

In [None]:
# Keep a copy of the stepwise results table and report its lowest-AIC model
if run_this_chunk:
    mods_bw_swreg_extra_df = mods_swreg_df.copy()
    # First row whose AIC equals the minimum AIC in the table
    best_mod = mods_bw_swreg_extra_df.loc[
        mods_bw_swreg_extra_df.aic == mods_bw_swreg_extra_df.aic.min()
    ].iloc[0]
    print("Best model for backward regression:")
    print("-" * 40)
    print(
        f"Formula:\t{best_mod.rhs_str}\n"
        f"aic:\t\t{best_mod.aic:.2f}\n"
        f"df_model:\t{int(best_mod.df_model)}"
    )

Again the selected models are the same, this time containing *all* the factors. We'll need to apply some judgement to choose the factors that go into the proposed model.

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>

# Proposed model

## Fit and score

In [None]:
# Factors for the proposed model: the engineered features, plus the simple
# features that are neither superseded nor deliberately excluded
chosen_factors = [
    'DrivAge_capped', 'DrivAge_pow2',
    'BonusMalus_over_50', 'BonusMalus_mod3',
    'VehAge_new', 'VehAge_capped',
    'VehBrand_grd',
    'Density_log',
    'Region_grd',
] + [
    fac for fac in simple_features
    if fac not in [
        # Superseded by their engineered versions
        "DrivAge", "BonusMalus", "VehAge",
        "VehBrand", "Region", "Density",
        # Not included at all
        "VehGas",
    ]
]
chosen_factors

In [None]:
%%time
# Takes approx 20 secs
mods_df.loc[2, ['mod_name', 'descr']] = [
    'Proposed model', 
    'Following assessment of engineered features'
]
mods_df.GLMResults[2] = smf.glm(
    "ClaimNb ~ 1 + " + " + ".join(  # <<< Manually specify the factors
        chosen_factors
    ),
    data=df_train_mod, exposure=np.asarray(df_train_mod['Exposure']),
    family=sm.families.Poisson(sm.genmod.families.links.log()),
).fit()
print(mods_df.GLMResults[2].summary())

In [None]:
# Drop the references to the stepwise results tables that are no longer needed
mods_fw_swreg_extra_df = mods_bw_swreg_extra_df = mods_swreg_df = None

In [None]:
%%time
# Score data for analysis
# Takes under 10 secs
scored_dfs[2] = score_data(df_train, mods_df.GLMResults[2])

In [None]:
# Reasonableness checks: scoring must preserve the claim and exposure totals
assert abs(scored_dfs[2]['act_Nb'].sum() - df_train['ClaimNb'].sum()) < 1e-7
assert abs(scored_dfs[2]['wgt'].sum() - df_train['Exposure'].sum()) < 1e-7
print("Correct: Reasonableness checks pass\n")
# Compare total predicted claims against total actual claims
print(f"Predicted number of claims:\t{scored_dfs[2]['pred_Nb'].sum():,.1f}")
print(f"Actual number of claims:\t{scored_dfs[2]['ClaimNb'].sum():,.1f}")
print(f"Difference:\t\t\t{scored_dfs[2]['pred_Nb'].sum() - scored_dfs[2]['ClaimNb'].sum():,.1f}")

## Visualise fit
The model with selected simple features, and then the proposed model.

In [None]:
# Keep only the scored rows whose index also appears in df_test_mod
df_for_plt1 = scored_dfs[1][scored_dfs[1].index.isin(df_test_mod.index)]
# The filtered table must have as many rows as df_test_mod
assert len(df_for_plt1) == len(df_test_mod)

In [None]:
# Build the lift-plot data from df_for_plt1, then draw and display the plot
lift_plt_data_df = get_agg_plot_data(df_for_plt1, set_config="lift")
lift_plt = create_plot(lift_plt_data_df)
show(lift_plt)

In [None]:
# Lift: the last row's average divided by the first row's average
lift_actuals = lift_plt_data_df['act_av'].iat[-1] / lift_plt_data_df['act_av'].iat[0]
lift_pred = lift_plt_data_df['pred_av'].iat[-1] / lift_plt_data_df['pred_av'].iat[0]
print(f'Lift on actuals:\t{lift_actuals:.3f}')
print(f'Lift on predicted:\t{lift_pred:.3f}')

In [None]:
# Keep only the scored rows whose index also appears in df_test_mod
df_for_plt2 = scored_dfs[2][scored_dfs[2].index.isin(df_test_mod.index)]
# The filtered table must have as many rows as df_test_mod
assert len(df_for_plt2) == len(df_test_mod)

In [None]:
# Build the lift-plot data from df_for_plt2, then draw and display the plot
lift_plt_data_df = get_agg_plot_data(df_for_plt2, set_config="lift")
lift_plt = create_plot(lift_plt_data_df)
show(lift_plt)

In [None]:
# Lift: the last row's average divided by the first row's average
lift_actuals = lift_plt_data_df['act_av'].iat[-1] / lift_plt_data_df['act_av'].iat[0]
lift_pred = lift_plt_data_df['pred_av'].iat[-1] / lift_plt_data_df['pred_av'].iat[0]
print(f'Lift on actuals:\t{lift_actuals:.3f}')
print(f'Lift on predicted:\t{lift_pred:.3f}')

Both models have similar results. In the absence of any other more significant consideration, we choose the model with the engineered features as it is simpler (i.e. has fewer degrees of freedom), so will possibly generalise better to unseen data.

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>

# Output results
This involves:
- Fitting the model on all the training data
- Scoring the validation data
- Plotting lift on the validation data
- Saving the output

In [None]:
%%time
# Takes approx 20 secs
mods_df.loc[3, ['mod_name', 'descr']] = [
    'Output model', 
    'Same as proposed but on all the training data'
]
mods_df.GLMResults[3] = smf.glm(
    "ClaimNb ~ 1 + " + " + ".join(
        chosen_factors
    ),
    # Use all of the training data
    data=df_train, exposure=np.asarray(df_train['Exposure']),
    family=sm.families.Poisson(sm.genmod.families.links.log()),
).fit()
print(mods_df.GLMResults[3].summary())

In [None]:
%%time
# Score *validation* data for reporting
# Takes under 10 secs
scored_dfs[3] = score_data(df_validation, mods_df.GLMResults[3])

In [None]:
# Reasonableness checks: scoring must preserve the claim and exposure totals
assert abs(scored_dfs[3]['act_Nb'].sum() - df_validation['ClaimNb'].sum()) < 1e-7
assert abs(scored_dfs[3]['wgt'].sum() - df_validation['Exposure'].sum()) < 1e-7
print("Correct: Reasonableness checks pass\n")
# Compare total predicted claims against total actual claims
print(f"Predicted number of claims:\t{scored_dfs[3]['pred_Nb'].sum():,.1f}")
print(f"Actual number of claims:\t{scored_dfs[3]['ClaimNb'].sum():,.1f}")
print(f"Difference:\t\t\t{scored_dfs[3]['pred_Nb'].sum() - scored_dfs[3]['ClaimNb'].sum():,.1f}")

In [None]:
# Build the lift-plot data from the scored validation data, then draw and display it
lift_plt_data_df = get_agg_plot_data(scored_dfs[3], set_config="lift")
lift_plt = create_plot(lift_plt_data_df)
show(lift_plt)

In [None]:
# Lift: the last row's average divided by the first row's average
lift_actuals = lift_plt_data_df['act_av'].iat[-1] / lift_plt_data_df['act_av'].iat[0]
lift_pred = lift_plt_data_df['pred_av'].iat[-1] / lift_plt_data_df['pred_av'].iat[0]
print(f'Lift on actuals:\t{lift_actuals:.3f}')
print(f'Lift on predicted:\t{lift_pred:.3f}')

### Save results

In [None]:
# Output scored validation data
# NOTE(review): pandas infers compression from the file extension, and ".gzip"
# is presumably not one it recognises (".gz" is), so this file is likely an
# uncompressed pickle despite its name -- confirm before relying on it.
scored_dfs[3].to_pickle("df_validation_GLM_preds.gzip")

In [None]:
# Save the fitted results object stored in row 3 to disk
mods_df.loc[3, 'GLMResults'].save("GLMResults_obj.pkl")

In [None]:
# Check they have saved
# (IPython shell escape: long listing of the working directory, with
# human-readable file sizes)
!ls -lh

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>

# Rough work only

In [None]:
# To score data manually, we need an object for 
# converting data into design matrix fields

# import patsy
# _, des_mx_obj = patsy.dmatrices(
#     mods_df.GLMResults[0].model.formula, 
#     df_train_mod, # Needs to be the data used for fitting the model
#     return_type='dataframe'
# )
# patsy.build_design_matrices(
#     [des_mx_obj.design_info],
#     scored_df.loc[4554,:],
#     return_type='dataframe'
# )[0]

### Partial residuals
It seems we need to calculate these manually for features that account for multiple columns in the design matrix.

#### Definitions
$$
\textrm{Working residuals: } r_i^W = (y_i - \hat{y}_i)\, g'(\hat{y}_i) = 
\frac{y_i - \hat{y}_i}{\hat{y}_i} \textrm{ for Poisson with log link}\\
\textrm{Partial residuals for covariate }k\textrm{: } r_i^{[k]} = r_i^W + x_{ik}\hat{\beta}_k
$$

Presumably, if there are multiple $\beta_k$ for a term, we need to add all the associated $x_{ik}\hat{\beta}_k$. Consider whether they should be weighted (by exposure). Consider what we expect these to show - do we need to have centred the $x_i$ first?

In [None]:
# NOT COMPLETE

# import patsy

# def score_data(data_df, GLMRes_obj):
#     raw_exog_names = pd.Series(GLMRes_obj.model.exog_names[1:]).str.split(
#         '[', expand=True, n=1).iloc[:,0].drop_duplicates().to_list()
#     scored_df = data_df.assign(
#         wgt=lambda x: x.Exposure,
#         act_freq=lambda x: x[GLMRes_obj.model.endog_names] / x.wgt,
#         pred_freq=lambda x: GLMRes_obj.predict(x[raw_exog_names]),
#         act_Nb=lambda x: x[GLMRes_obj.model.endog_names],
#         pred_Nb=lambda x: x.pred_freq * x.wgt,
#         resid_Nb=lambda x: x.act_Nb - x.pred_Nb,
#     )
#     return(scored_df)

# GLMRes_obj = mods_df.GLMResults[0]
# training_data_df = df_train_mod
# scored_data_df = scored_dfs[0].copy()

# raw_exog_names = pd.Series(GLMRes_obj.model.exog_names[1:]).str.split(
#         '[', expand=True, n=1).iloc[:,0].drop_duplicates().to_list()

# data_df = df_train.iloc[:1000,:]
# GLMRes_obj = mods_df.GLMResults[0]
# _, des_mx_obj = patsy.dmatrices(
#     GLMRes_obj.model.formula, 
#     df_train_mod,
#     return_type='matrix'
# )

# data_df = score_data(data_df, GLMRes_obj)

# data_dmx = patsy.build_design_matrices(
#     [des_mx_obj.design_info],
#     data_df,
#     return_type='matrix'
# )[0]

# data_df = pd.concat([
#     data_df,
#     pd.DataFrame(np.array([
#         np.matmul(
#             data_dmx[:, term_slice],
#             GLMRes_obj.params[term_slice]
#         )
#         for _, term_slice 
#         in des_mx_obj.design_info.term_name_slices.items()
#     ]).T, columns=[
#         'clp_' + term_name for term_name 
#         in data_dmx.design_info.term_names
#     ], index=data_df.index)
# ], sort=False, axis=1).assign(**{
#     ('cpr_' + exog_name): lambda x: np.log(x.act_freq) - np.log(x.pred_freq) + x['clp_' + exog_name]
#     for exog_name in raw_exog_names
# })

# data_df

<div align="right" style="text-align: right"><a href="#Contents">Back to Contents</a></div>