In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
sns.set_style('darkgrid')
sns.set_palette('viridis', n_colors=10)
colours = sns.color_palette('viridis', n_colors=10)
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool  #, MetricVisualizer

In [None]:
# Training CatBoost on a Kaggle kernel proved hard as 
# it quickly runs out of CPU RAM and the kernel restarts
# The following is an attempt to reduce the memory footprint
# https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The dataframe is modified in place and also returned.

    * ``object`` columns are converted to ``category``.
    * Integer columns (signed or unsigned) are cast to the smallest signed
      integer type whose range contains the column's min and max (bounds are
      inclusive). Columns that fit no candidate are left unchanged.
    * Float columns are cast to the smallest float type whose range contains
      the column's min and max, falling back to ``float64``.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; mutated in place.
    allow_float16 : bool, default True
        When True, floats whose range fits are stored as ``float16``. This is
        a lossy cast (roughly three significant decimal digits); pass False
        to use ``float32`` as the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns can be range-checked safely;
        # min()/max() or the range comparison fail on e.g. unordered
        # categoricals and datetimes, and bools gain nothing from a float cast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # An empty column has no min/max to compare against
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            # Plain Python ints avoid mixed signed/unsigned comparisons
            lowest, highest = int(c_min), int(c_max)
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= lowest and highest <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                bounds = np.finfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # NaN-only, infinite or out-of-range columns stay at full precision
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


def import_data(file):
    """Read a CSV file into a dataframe and shrink its memory footprint.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has downcast its columns.
    """
    # No index column is set, so ``parse_dates=True`` had nothing to parse,
    # and ``keep_date_col`` is deprecated in recent pandas; both are omitted.
    df = pd.read_csv(file)
    return reduce_mem_usage(df)

In [None]:

# Load the training data and drop the id column
print('-' * 80)
print('train')
df_train = import_data('../input/allstate-claims-severity/train.csv')
df_train = df_train.drop(columns='id')

# Load the test data; the ids are kept aside to index the submission later
print('-' * 80)
print('test')
df_test = import_data('../input/allstate-claims-severity/test.csv')
test_ids = df_test['id']
df_test = df_test.drop(columns='id')

In [None]:
# Report (rows, columns) for both datasets
print(f'Shape of training data: {df_train.shape}')
print(f'Shape of test data: {df_test.shape}')

In [None]:
# Count missing values per column in each dataset and show only the
# columns that have any.
train_nulls = df_train.isna().sum()
# The test counts must come from df_test (previously df_train was counted twice)
test_nulls = df_test.isna().sum()
print('Train nulls:', train_nulls[train_nulls != 0])
print('Test nulls:', test_nulls[test_nulls != 0])
del train_nulls
del test_nulls

No null values in either the train or test data, which is good news.

In [None]:
# Summary statistics, transposed so that each described column is one row
df_train.describe().T

The target variable, loss, appears to have a non-normal distribution, with the median significantly lower than the mean. This suggests that it has a positive skew, with a few very large insurance claims. This kind of distribution is common with amounts of money.

In [None]:
# Shape statistics of the raw target
skew = stats.skew(df_train['loss'])
kurtosis = stats.kurtosis(df_train['loss'])

# Histogram of the raw target with an annotation box of summary statistics
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df_train['loss'], ax=ax)
ax.set_title('Distribution of the target variable', loc='left', fontsize=16, color=colours[0])
ax.set_xlabel('Loss (USD)')
textstr = (
    f"Mean: {df_train['loss'].mean():,.2f}{' ' * 12}\n"
    f"Median: {df_train['loss'].median():,.2f}{' ' * 12}\n"
    f"Skew: {skew:.2f}\n"
    f"Kurtosis: {kurtosis:.2f}"
)
props = dict(facecolor='k', alpha=0.2)
# The text position is given in data coordinates (x in USD, y in counts)
ax.text(x=90000, y=5000, s=textstr, fontsize=12, bbox=props)

plt.show()

# Release the temporaries created by this cell
del skew, kurtosis, fig, ax

In [None]:
# Replace the target with log(1 + loss); all feature columns are kept as they are
transformed_loss = np.log1p(df_train['loss'])
df_train_transform = pd.concat(
    [df_train.drop(columns='loss'), transformed_loss],
    axis=1,
)
print('Length of train df post transformation:', df_train_transform.shape[0])

del transformed_loss

In [None]:
# Shape statistics of the log1p-transformed target
skew = stats.skew(df_train_transform['loss'])
kurtosis = stats.kurtosis(df_train_transform['loss'])

# Histogram of the transformed target. The title and axis label state that the
# values are on the log scale (they previously repeated the raw-USD labels).
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df_train_transform['loss'], ax=ax)
ax.set_title('Distribution of the log-transformed target variable', 
             loc='left', 
             fontsize=16, 
             color=colours[0])
ax.set_xlabel('Loss (log USD)')
textstr = f"Mean: {df_train_transform['loss'].mean():,.2f}\
            \nMedian: {df_train_transform['loss'].median():,.2f}\
            \nSkew: {skew:.2f}\nKurtosis: {kurtosis:.2f}"
props = dict(facecolor='k', alpha=0.2)
# The text position is given in data coordinates (x in log USD, y in counts)
ax.text(x=0.5, y=3000, s=textstr, fontsize=12, bbox=props)

plt.show()

# Release the temporaries created by this cell
del skew, kurtosis, fig, ax

There seems to be one datapoint that could be an outlier near zero, so I will remove this one row

In [None]:
# Rows whose log-transformed loss is below 1 are treated as outliers and dropped
is_outlier = df_train_transform['loss'] < 1
index_to_remove = df_train_transform.index[is_outlier]
print(f'Number of outliers to remove: {len(index_to_remove)}')
df_train_transform = df_train_transform.drop(index=index_to_remove)
print(f'Number of train observations: {df_train_transform.shape[0]}')

del index_to_remove, is_outlier

In [None]:
# Column names are split purely by position. This assumes the column layout is
# [categorical features..., 14 continuous features, loss] -- TODO confirm
# against the CSV header.
cont_var = df_train.columns.values[-15:-1]  # the 14 columns before the last one
cat_var = df_train.columns.values[:-15]     # every column before those

In [None]:
# Report the skewness of each continuous feature, names padded to 7 characters
print('Skew of numerical variables:')
for var in cont_var:
    var_skew = stats.skew(df_train[var])
    print(f'{var:<7}: {var_skew:.2f}')

# cont_var is not needed after this cell
del cont_var, var_skew

In [None]:
# Check for correlation with the target or amongst the independent variables.
# Only numeric columns are passed to corr(): recent pandas versions raise on
# non-numeric (category) columns instead of silently dropping them.
correlation = df_train.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(correlation, annot=True, fmt='.2f', linewidth=2, ax=ax, cmap='viridis')
ax.set_title('Correlation of continuous variables', loc='left', fontsize=16, color=colours[3])
plt.show()

del correlation, fig, ax

# **Train and validation split**

Since there is a large number of training observations, I will split some out as a validation set which I will use in appraising the performance of my models and which I will be able to use for early stopping of training of boosting algorithms.

In [None]:
# Shared, seeded random generator. It is stateful: every draw advances it, so
# cells that use it must run in order (once each) to reproduce the same samples.
rng = np.random.RandomState(0)

In [None]:
# Hold out a random 20% of the cleaned training rows for validation
n_observations = df_train_transform.shape[0]
print(f'Number of training observations prior to split: {n_observations}')
val_ratio = 0.2
n_val_observations = int(n_observations * val_ratio)

df_val = df_train_transform.sample(n=n_val_observations, random_state=rng)
# Everything that was not sampled for validation remains in the training set
in_val = df_train_transform.index.isin(df_val.index)
df_train = df_train_transform.loc[~in_val]

print('======================================================')
print(f'Number of training observations post split: {df_train.shape[0]}')
print(f'Number of validation observations post split: {df_val.shape[0]}')

# n_val_observations is kept: it is reused for a later split
del n_observations, val_ratio, in_val

In [None]:
# Separate the features from the (log-transformed) target for both splits
X_train, y_train = df_train.drop(columns='loss'), df_train['loss']
X_val, y_val = df_val.drop(columns='loss'), df_val['loss']

In [None]:
# Wrap both splits in CatBoost Pools, declaring which columns are categorical
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_var)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_var)

# **Modelling**

## Initial performance prior to tuning

In [None]:
# Baseline regressor: no tuned hyper-parameters are passed.
# `iterations` is an upper bound, since early stopping is applied when fitting.
# MAE is optimised on the log1p-transformed target, not on USD.
model = CatBoostRegressor(iterations=4000,
                          verbose=200,
                          random_seed=101,
                          loss_function='MAE',
                          task_type='GPU')

In [None]:
# Fit on the training pool, monitoring the validation pool; training stops
# early via `early_stopping_rounds=100`.
model.fit(train_pool,
          eval_set=val_pool, 
          early_stopping_rounds=100)

In [None]:
# Visualise loss on the training and validation data
def plot_loss(model):
    """
    Draw the per-iteration MAE of a fitted CatBoost estimator.

    model: a fitted estimator whose `evals_result_` contains 'learn' and
           'validation' entries with an 'MAE' series each.

    Shows the figure; nothing is returned.
    """
    history = model.evals_result_
    curves = (
        (history['learn']['MAE'], colours[2], 'Training error'),
        (history['validation']['MAE'], colours[6], 'Validation error'),
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    for errors, colour, label in curves:
        ax.plot(errors, color=colour, label=label)
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Log loss (MAE)')
    ax.set_title('Training and validation loss', loc='left', fontsize=16, color=colours[2])
    ax.legend()
    plt.show()
    

# Evaluate MAE of predictions in USD not log USD
def evaluate_predictions(log_act, log_pred):
    """
    Mean absolute error on the original scale for log1p-scale inputs.

    log_act: actual observations after a log1p transform
    log_pred: predictions on the same log1p scale

    Both inputs are mapped back with expm1 before the error is computed.
    Returns the MAE of the inverse-transformed values.
    """
    return mean_absolute_error(np.expm1(log_act), np.expm1(log_pred))

In [None]:
# MetricVisualizer('Catboost_regressor').start()
# Learning curves recorded while fitting the baseline model
plot_loss(model=model)

In [None]:
# Check the performance metric of the model on the inverse transformed target:
# predictions are made on the log1p scale and converted back inside
# evaluate_predictions before the MAE is computed.
pred = model.predict(X_val)
mae = evaluate_predictions(y_val, pred)
print(f'MAE: {mae:.4f}')

In [None]:
# Plotting the residuals
def plot_residuals(y_true, y_pred):
    """
    Plot residual diagnostics for predictions on the log scale.

    y_true: observed target values as a pandas Series (its `.mean()` and
            `.median()` are used after subtracting the predictions)
    y_pred: predicted values, aligned element-wise with y_true

    Left panel: histogram of the residuals (y_true - y_pred), annotated with
    their mean, median and skew.
    Right panel: residuals against the predicted values with a zero reference
    line.

    Shows the figure; nothing is returned.
    """
    residuals = y_true - y_pred
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

    axs[0].hist(residuals, bins=60, color=colours[1])
    ave = residuals.mean()
    med = residuals.median()
    skew = stats.skew(residuals)
    textstr = f'Mean: {ave:.2f}\nMedian: {med:.2f}\nSkew: {skew:.2f}'
    # Axes-fraction coordinates keep the annotation in place for any data range
    axs[0].annotate(textstr,
                    xy=(0.05, 0.8),
                    xycoords='axes fraction',
                    fontsize=12)
    axs[0].set_title('Distribution of residuals', 
                      loc='left', 
                      fontsize=16,
                      color=colours[1])
    axs[0].set_xlabel('Residuals (log USD)')

    axs[1].scatter(x=y_pred, y=residuals, color=colours[1], s=4)
    # axhline spans the full width of the axes, so the zero line no longer
    # depends on a hard-coded prediction range (previously x from 6 to 11)
    axs[1].axhline(y=0, linestyle='--', color=colours[-2], linewidth=3)
    axs[1].set_ylabel('Residuals (log USD)')
    axs[1].set_xlabel('Predicted loss (log USD)')
    axs[1].set_title('Residuals plot', 
                     loc='left', 
                     fontsize=16, 
                     color=colours[1])

    plt.tight_layout()
    plt.show()

In [None]:
# Residual diagnostics for the baseline model on the validation split
plot_residuals(y_val, pred)

In the right-hand plot there appears to be some heteroscedasticity in the error terms; however, this is likely just due to the fact that there are many more insurance payouts between 7 and 8 log USD and fewer around 9 log USD. 

The histogram on the left suggests that the error terms are normally distributed.

In [None]:
# Drop the baseline model and its outputs, then run the garbage collector
del model, pred, mae
gc.collect()

## Hyper-parameter tuning

**NOTE: Unfortunately, I was unable to get the same good results from hyper-parameter tuning on the Kaggle GPU that I was getting locally on a CPU. Further, I cannot run the CatBoost training on a Kaggle CPU as it runs out of memory and the kernel restarts. As such, I will just have to use the default hyper-parameters.**

In [None]:
# params = {
#     'max_depth' : np.arange(4, 14, dtype=int),
#     'colsample_bylevel': stats.uniform(loc=0.2, scale=0.3),
#     'subsample': stats.uniform(loc=0.2, scale=0.3),
#     'l2_leaf_reg': stats.uniform(loc=1, scale=9),
#     'learning_rate': stats.uniform(loc=0.001, scale=0.199),
#     'min_child_samples': np.arange(10, 500, dtype=int),
#     'bagging_temperature': np.arange(1, 100, dtype=int),
#     'random_strength': np.arange(0, 100, dtype=int)
# }

In [None]:
# model = CatBoostRegressor(iterations=2000, 
#                           verbose=200, 
#                           random_seed=101, 
#                           loss_function='MAE')

In [None]:
# search = RandomizedSearchCV(model, 
#                             param_distributions=params,
#                             cv=3,
#                             n_iter=10, 
#                             verbose=2)

In [None]:
# results = search.fit(X=X_train,
#                      y=y_train,
#                      eval_set=(X_val, y_val),
#                      early_stopping_rounds=100,
#                      cat_features=cat_var)

# print(results.best_params_)

In [None]:
# best_params = {
#     'bagging_temperature': 88, 
#     'colsample_bylevel': 0.26769090768437076, 
#     'l2_leaf_reg': 9.132279703937915, 
#     'learning_rate': 0.054376348705150734, 
#     'max_depth': 9, 
#     'min_child_samples': 296, 
#     'random_strength': 74, 
#     'subsample': 0.45510384145401844
# }

In [None]:
# model = CatBoostRegressor(iterations=2500, 
#                           verbose=100, 
#                           random_seed=101, 
#                           loss_function='MAE', 
#                           train_dir='Catboost_regressor',
#                           task_type='GPU', 
#                           **best_params)

In [None]:
# model.fit(train_pool,
#           eval_set=val_pool,
#           early_stopping_rounds=250)

In [None]:
# MetricVisualizer('Catboost_regressor').start()
# plot_loss(model=model)

In [None]:
# Check the performance metric of the model on the inverse transformed target
# pred = model.predict(X_val)
# mae = evaluate_predictions(y_val, pred)
# print(f'MAE: {mae:.4f}')

## Removing categories unique to the training or testing data

There are numerous categorical features, which have categories only present in either the train or test data. These could be affecting model performance so I will remove these and replace them with 'null'. This will flag them as special values to the CatBoost model.

In [None]:
# Give the test rows an all-missing target so they can be stacked with the
# training rows and separated again later on `loss` being NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_test['loss'] = np.nan
# The original row labels are kept, so train and test labels overlap in df_full
df_full = pd.concat([df_train_transform, df_test])
print('Total number observations:', df_full.shape[0])

In [None]:
def filter_cat(x, to_remove):
    """Return the placeholder 'null' when `x` is in `to_remove`, else `x` itself."""
    return 'null' if x in to_remove else x

In [None]:
# reduce_mem_usage() stores string columns as 'category', so both dtypes are
# selected here; selecting only 'object' would silently skip every column and
# leave df_full unchanged.
for col in df_train_transform.select_dtypes(include=['object', 'category']).columns:
    # Compare the complete training set (not the post-split df_train) with the
    # test set, so that categories present only in validation rows are not
    # mistaken for test-only categories.
    set_train = set(df_train_transform[col].unique())
    set_test = set(df_test[col].unique())
    # Categories that appear on one side only
    remove = set_train ^ set_test

    if remove:
        # Work on plain objects so that 'null' can be written regardless of
        # the column's category list
        values = df_full[col].astype(object)
        df_full[col] = values.where(~values.isin(remove), 'null')

In [None]:
# Rows with a target are training rows; rows with a missing target are test rows
df_train_2 = df_full.dropna(subset=['loss'])
df_test_2 = df_full.loc[df_full['loss'].isna()]

print(f'shape of train data: {df_train_2.shape}')
print(f'shape of test data: {df_test_2.shape}')

In [None]:
# Reuse exactly the validation rows held out for the baseline model. Drawing a
# fresh sample from the shared, already-advanced `rng` would select different
# rows, so the two models' validation MAE values would not be comparable.
df_val_2 = df_train_2.loc[df_val.index]
df_train_2 = df_train_2.drop(df_val_2.index)

print('======================================================')
print('Number of training observations post split:', df_train_2.shape[0])
print('Number of validation observations post split:', df_val_2.shape[0])

In [None]:
# Separate the features from the target for the filtered-category splits
X_train_2, y_train_2 = df_train_2.drop(columns='loss'), df_train_2['loss']
X_val_2, y_val_2 = df_val_2.drop(columns='loss'), df_val_2['loss']

# The first experiment's inputs are no longer needed
del X_train, y_train, X_val, y_val, train_pool, val_pool

In [None]:
# CatBoost Pools for the filtered-category data
train_pool_2 = Pool(data=X_train_2, label=y_train_2, cat_features=cat_var)
val_pool_2 = Pool(data=X_val_2, label=y_val_2, cat_features=cat_var)

In [None]:
# best_params = {
#     'bagging_temperature': 88, 
#     'colsample_bylevel': 0.26769090768437076, 
#     'l2_leaf_reg': 9.132279703937915, 
#     'learning_rate': 0.054376348705150734, 
#     'max_depth': 9, 
#     'min_child_samples': 296, 
#     'random_strength': 74, 
#     'subsample': 0.45510384145401844
# }

In [None]:
# Same settings as the baseline model, so that any change in validation MAE
# comes from the category filtering rather than from the configuration.
model = CatBoostRegressor(
    iterations=4000, 
    verbose=200, 
    random_seed=101, 
    loss_function='MAE',
    task_type='GPU'
#     train_dir='Catboost_regressor_3',
#     **best_params
)

In [None]:
# Fit on the filtered-category training pool, monitoring the matching
# validation pool; note the early-stopping patience is 250 here (100 for the
# baseline fit).
model.fit(
    train_pool_2,
    eval_set=val_pool_2,
    early_stopping_rounds=250
)

In [None]:
# MetricVisualizer('Catboost_regressor_3').start()
# Learning curves for the model trained on the filtered-category data
plot_loss(model=model)

In [None]:
# Check the performance metric of the model on the inverse transformed target:
# predictions are made on the log1p scale and converted back inside
# evaluate_predictions before the MAE is computed.
pred = model.predict(X_val_2)
mae = evaluate_predictions(y_val_2, pred)
print(f'MAE: {mae:.4f}')

A modest improvement. When I ran this locally on CPU, combined with the hyper parameter tuning, I was getting MAE in the range of roughly 1030 to 1050 USD, depending on the cut of the data. The validation loss was still decreasing so I will increase the number of iterations greatly for the final model.

In [None]:
# Plotting the residuals of the filtered-category model on its validation split
plot_residuals(y_val_2, pred)

# **Final model**

I will train the model on all of the training observations (train + val), in addition to running more iterations. 

In [None]:
# All rows with a target (train + validation) are used for the final fit;
# rows with a missing target are the test rows
has_target = df_full['loss'].notna()
df_train_3 = df_full.loc[has_target]
df_test_3 = df_full.loc[~has_target]
del has_target

print(f'Number of training observations: {df_train_3.shape[0]}')
print(f'Number of testing observations: {df_test_3.shape[0]}')

In [None]:
# Features and target for the final fit
X_train_3, y_train_3 = df_train_3.drop(columns='loss'), df_train_3['loss']

In [None]:
# Single training Pool for the final model; no validation pool is built
train_pool_3 = Pool(data=X_train_3, label=y_train_3, cat_features=cat_var)

In [None]:
# Final model: same loss and seed as before, with more iterations (5000).
# No eval set is passed when fitting, so there is no early stopping.
model = CatBoostRegressor(iterations=5000, 
                          verbose=300, 
                          random_seed=101, 
                          loss_function='MAE',
                          task_type='GPU')

In [None]:
# Fit on every labelled row. The eval set and early stopping are deliberately
# left disabled because no rows are held out at this stage.
model.fit(
    train_pool_3,
#     eval_set=val_pool, 
#     early_stopping_rounds=2000
)

In [None]:
# Persist the fitted final model under this file name in the working directory
model.save_model('insurance_claim_severity_catboost')

In [None]:
# df_test_3 still carries the all-NaN placeholder `loss` column; drop it so the
# model is given exactly the feature columns it was trained on.
pred = model.predict(df_test_3.drop('loss', axis=1))
# Predictions are on the log1p scale; convert them back to USD
inv_pred = np.expm1(pred)

In [None]:
# One `loss` column of USD predictions, indexed by the original test ids
submission = pd.DataFrame({'loss': inv_pred}, index=test_ids)

In [None]:
# Quick visual check of the distribution of the predicted losses (USD)
submission.loss.hist(bins=100)
plt.show()

In [None]:
# Write the submission file; the index (the test ids) is written as the first column
submission.to_csv('submission.csv')

In [None]:
# Scored 1122.15397 with a late submission