# XGBoost-Poisson-Response
This notebook gives an example of predicting a count / time response using xgboost.

## Import libraries

In [1]:
import pip
import pandas as pd
import numpy as np
import xgboost as xgb

## Download modelling data
Download the car dataset from http://www.businessandeconomics.mq.edu.au. <br> 
This is the dataCar dataset from the insuranceData R package (https://cran.r-project.org/web/packages/insuranceData/index.html). <br> 
The dataset includes claims count as well as policy exposure for each record so is suitable for modelling as poisson (with an offset).

In [2]:
site = 'http://www.businessandeconomics.mq.edu.au/our_departments/'
site_subfolder = 'Applied_Finance_and_Actuarial_Studies/acst_docs/glms_for_insurance_data/data/'
file_url = site + site_subfolder + 'car.csv'
data = pd.read_csv(file_url)
data.head()

Unnamed: 0,veh_value,exposure,clm,numclaims,claimcst0,veh_body,veh_age,gender,area,agecat,_OBSTAT_
0,1.06,0.303901,0,0,0.0,HBACK,3,F,C,2,01101 0 0 0
1,1.03,0.648871,0,0,0.0,HBACK,2,F,A,4,01101 0 0 0
2,3.26,0.569473,0,0,0.0,UTE,2,F,E,2,01101 0 0 0
3,4.14,0.317591,0,0,0.0,STNWG,2,F,D,2,01101 0 0 0
4,0.72,0.648871,0,0,0.0,HBACK,4,F,C,2,01101 0 0 0


In [3]:
data['log_exposure'] = np.log(data['exposure'])

## Transform categorical variables to dummy variables

In [4]:
dummy_cols = []
for col in ['gender', 'area', 'veh_body']:
    dummies = pd.get_dummies(data[col])
    dummies.columns = [col + '_' + lvl for lvl in dummies.columns.values]
    dummy_cols.extend(dummies.columns.values)
    data[dummies.columns.values] = dummies
data.head()

Unnamed: 0,veh_value,exposure,clm,numclaims,claimcst0,veh_body,veh_age,gender,area,agecat,...,veh_body_HBACK,veh_body_HDTOP,veh_body_MCARA,veh_body_MIBUS,veh_body_PANVN,veh_body_RDSTR,veh_body_SEDAN,veh_body_STNWG,veh_body_TRUCK,veh_body_UTE
0,1.06,0.303901,0,0,0.0,HBACK,3,F,C,2,...,1,0,0,0,0,0,0,0,0,0
1,1.03,0.648871,0,0,0.0,HBACK,2,F,A,4,...,1,0,0,0,0,0,0,0,0,0
2,3.26,0.569473,0,0,0.0,UTE,2,F,E,2,...,0,0,0,0,0,0,0,0,0,1
3,4.14,0.317591,0,0,0.0,STNWG,2,F,D,2,...,0,0,0,0,0,0,0,1,0,0
4,0.72,0.648871,0,0,0.0,HBACK,4,F,C,2,...,1,0,0,0,0,0,0,0,0,0


## Build xgboost model

In [5]:
expl_cols = ['veh_value', 'veh_age', 'agecat'] + dummy_cols

# reponse and explanatory columns
dtrain = xgb.DMatrix(data = data[expl_cols], 
                     label = data['numclaims'])

# set offset through base_margin
dtrain.set_base_margin(data['log_exposure'])

# set parameters
param = {'max_depth': 2, 
         'eta': 1, 
         'silent': 1, 
         'objective': 'count:poisson'}

# build model
num_round = 50
bst = xgb.train(param, dtrain, num_round)

## Check predicted values

In [6]:
preds = bst.predict(dtrain)

Average # of predicted claims.

In [7]:
preds.mean()

0.07276963

Total # of predicted claims

In [8]:
preds.sum()

4937.856

Total number of actual claims

In [9]:
data['numclaims'].sum()

4937

Overall predicted claims frequency

In [10]:
preds.sum() / sum(data['exposure'])

0.1552744920333865

Actual claims frequency

In [11]:
data['numclaims'].sum() / sum(data['exposure'])

0.15524757583850632

## Functions to print package info

In [12]:
def get_pkg_version(pkg_name, depth = 0):
    """Function to print package version and dependency versions."""
    x = [[pkg.key, pkg.version, pkg.requires()] \
         for pkg in pip.get_installed_distributions() if pkg.key in [pkg_name]][0]
    print('  ' * depth, x[:2])
    if (len(x[2]) > 0):
        for i in range(len(x[2])):
            get_pkg_version(x[2][i].key, depth + 1)

def print_pkg_info(pkg_names):
    """Function to call get_pkg_version for multiple packages."""
    for name in pkg_names:
        get_pkg_version(name)

## Record packages used in analysis

In [13]:
print_pkg_info(['xgboost', 'pandas', 'pip'])

 ['xgboost', '0.7']
   ['numpy', '1.14.0']
   ['scipy', '0.19.1']
 ['pandas', '0.20.3']
   ['python-dateutil', '2.6.1']
     ['six', '1.11.0']
   ['pytz', '2017.2']
   ['numpy', '1.14.0']
 ['pip', '9.0.1']
