In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import statsmodels.api as sm

import xgboost as xgb
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/french-motor-claims-datasets-fremtpl2freq/freMTPL2freq.csv


<div style="
  border-radius: 20px;
  padding: 25px;
  background: radial-gradient(circle at top, #cf1323, #000043);
  border: 2px solid #00ffc8;
  text-align: center;
  box-shadow: 0 0 25px rgba(0, 255, 200, 0.25);
">
  <h1 style="
    font-size: 28px;
    font-family: 'Trebuchet MS', sans-serif;
    letter-spacing: 2px;
    color: #00ffc8;
    text-shadow: 0 0 12px rgba(0,255,200,0.5);
  ">
    <span style="color:#7aa6ec5;">Load</span> <span style="color:#7aa6ec5;">the</span> <span style="color:#7aa6ec5;">DataFrame</span>
  </h1>
</div>

In [2]:
# Read the claim-frequency dataset and discard the policy identifier in one chain.
df = (
    pd.read_csv('/kaggle/input/french-motor-claims-datasets-fremtpl2freq/freMTPL2freq.csv')
    .drop(columns=['IDpol'])
)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region
0,1,0.1,D,5,0,55,50,B12,Regular,1217,R82
1,1,0.77,D,5,0,55,50,B12,Regular,1217,R82
2,1,0.75,B,6,2,52,50,B12,Diesel,54,R22
3,1,0.09,B,7,0,46,50,B12,Diesel,76,R72
4,1,0.84,B,7,0,46,50,B12,Diesel,76,R72


<div style="
  border-radius: 20px;
  padding: 25px;
  background: radial-gradient(circle at top, #cf1223, #000043);
  border: 2px solid #00ffc8;
  text-align: center;
  box-shadow: 0 0 25px rgba(0, 255, 200, 0.25);
">
  <h1 style="
    font-size: 28px;
    font-family: 'Trebuchet MS', sans-serif;
    letter-spacing: 2px;
    color: #00ffc8;
    text-shadow: 0 0 12px rgba(0,255,200,0.5);
  ">
    <span style="color:#7aa6ec5;">Exploratory</span> <span style="color:#7aa6ec2;">Data</span> <span style="color:#7aa6ec1;">Analysis</span>
  </h1>
</div>

The definitions of each column:

- ClaimNb: Claim numbers during exposure period
- Exposure: Period of exposure for a policy in year(s)
- VehPower: Car power, in categorical format
- VehAge: Car age, in continuous value format
- DrivAge: Driver’s age in years, in continuous value format
- BonusMalus: Bonus / Malus rating, <100 means bonus, >100 means malus, in continuous value format
- VehBrand: Car brand, in categorical format
- VehGas: Fuel type, in categorical format
- Area: Density rating of the area car driver lives in, in categorical format
- Density: Density rating, number of inhabitants per km^2, in continuous value format
- Region: Policy region in France, in categorical format

In [4]:
# Show column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 678013 entries, 0 to 678012
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ClaimNb     678013 non-null  int64  
 1   Exposure    678013 non-null  float64
 2   Area        678013 non-null  object 
 3   VehPower    678013 non-null  int64  
 4   VehAge      678013 non-null  int64  
 5   DrivAge     678013 non-null  int64  
 6   BonusMalus  678013 non-null  int64  
 7   VehBrand    678013 non-null  object 
 8   VehGas      678013 non-null  object 
 9   Density     678013 non-null  int64  
 10  Region      678013 non-null  object 
dtypes: float64(1), int64(6), object(4)
memory usage: 56.9+ MB


In [5]:
# Number of policies in each Area category.
df['Area'].value_counts()

Area
C    191880
D    151596
E    137167
A    103957
B     75459
F     17954
Name: count, dtype: int64

In [16]:
# Column labels of the loaded frame (IDpol has already been dropped).
df.columns

Index(['ClaimNb', 'Exposure', 'Area', 'VehPower', 'VehAge', 'DrivAge',
       'BonusMalus', 'VehBrand', 'VehGas', 'Density', 'Region'],
      dtype='object')

In [6]:
# Name of the response column; it is kept out of every feature list below.
target = 'ClaimNb'

# Object-typed columns are treated as categorical, everything else as numeric.
cat_cols = list(df.select_dtypes(include='object').columns)
num_cols = list(df.select_dtypes(exclude='object').columns.drop(target))

# Raw input columns in their original order (every column except the target).
base = [
    'Exposure', 'Area', 'VehPower', 'VehAge', 'DrivAge',
    'BonusMalus', 'VehBrand', 'VehGas', 'Density', 'Region',
]

In [7]:
# Numeric columns, with the target already removed.
print(num_cols)

['Exposure', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']


In [4]:
# NOTE(review): the saved output of this cell reports 12 columns, but df has 11
# once IDpol is dropped in the load cell (df.info() agrees) -- the output looks
# stale from an earlier run; re-run with Restart & Run All.
df.shape

(678013, 12)

In [7]:
# Distribution of the target: how many policies have each claim count.
df['ClaimNb'].value_counts()

ClaimNb
0     643953
1      32178
2       1784
3         82
4          7
11         3
5          2
6          1
8          1
16         1
9          1
Name: count, dtype: int64

### Check for missing values

In [9]:
# Count missing values per column.
df.isnull().sum()

ClaimNb       0
Exposure      0
Area          0
VehPower      0
VehAge        0
DrivAge       0
BonusMalus    0
VehBrand      0
VehGas        0
Density       0
Region        0
dtype: int64

### There are no missing values in the dataset

In [8]:
# Summary statistics of the numeric columns before any capping.
df.describe()

Unnamed: 0,ClaimNb,Exposure,VehPower,VehAge,DrivAge,BonusMalus,Density
count,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0
mean,0.053247,0.52875,6.454631,7.044265,45.499122,59.761502,1792.422405
std,0.240117,0.364442,2.050906,5.666232,14.137444,15.636658,3958.646564
min,0.0,0.002732,4.0,0.0,18.0,50.0,1.0
25%,0.0,0.18,5.0,2.0,34.0,50.0,92.0
50%,0.0,0.49,6.0,6.0,44.0,50.0,393.0
75%,0.0,0.99,7.0,11.0,55.0,64.0,1658.0
max,16.0,2.01,15.0,100.0,100.0,230.0,27000.0


# Feature Engineering

### We will also simplify the data for our GLM model. In particular, we will adjust the following columns:

- ClaimNb: cap at 4 claims
- VehAge: cap at 20 years
- DrivAge: cap at 90 years old
- BonusMalus: cap at 150, round to nearest integer
- Density: apply log(1 + x) (`np.log1p`), stored in a new `Log_Density` column
- Exposure: cap at 1 year

and subsequently add the following 'GLM' columns that will be used for model fitting:
- AreaGLM: convert alphabet into integer
- VehPowerGLM: cap at 9
- VehAgeGLM: create 3 bins
- DrivAgeGLM: create 7 bins
- BonusMalusGLM: copy of the capped BonusMalus
- DensityGLM: copy of Density (not log-transformed)


One reason for capping variables is that insurance claim counts are highly skewed: most policyholders have 0-2 claims, but a few may have 10 or more (the maximum in this dataset is 16). Extreme values such as 20 claims are rare and are often due to data errors, fraud, etc.

In [10]:
def fe(df):
    """Build the capped and binned frame used for claim-frequency modelling.

    The input is copied first, so the caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Policy data containing at least the columns ``ClaimNb``, ``Exposure``,
        ``Area``, ``VehPower``, ``VehAge``, ``DrivAge``, ``BonusMalus`` and
        ``Density``.

    Returns
    -------
    df_freq : pandas.DataFrame
        Copy of ``df`` in which ``ClaimNb``, ``VehAge``, ``DrivAge``,
        ``Exposure`` and ``BonusMalus`` are capped, with the engineered
        columns appended.
    fe_cols : list of str
        Names of the columns that were added, in creation order.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If ``BonusMalus`` contains missing values (they cannot be cast to int).
    """
    df_freq = df.copy()

    # Upper caps applied in place; values below the cap are unchanged.
    upper_caps = {'ClaimNb': 4, 'VehAge': 20, 'DrivAge': 90, 'Exposure': 1.0}
    for col, cap in upper_caps.items():
        df_freq[col] = df_freq[col].clip(upper=cap)

    # Cap at 150, then round to the nearest integer before the int cast;
    # a bare astype(int) would truncate fractional ratings instead.
    df_freq['BonusMalus'] = df_freq['BonusMalus'].clip(upper=150).round().astype(int)

    # Vehicle power capped at 9 and stored as a string label (categorical use).
    df_freq['VehPowerGLM'] = df_freq['VehPower'].clip(upper=9).astype(str)

    # log(1 + Density)
    df_freq['Log_Density'] = np.log1p(df_freq['Density'])

    # Area letter -> integer code: 'A' -> 1, 'B' -> 2, ...
    df_freq['AreaGLM'] = df_freq['Area'].map(ord) - 64

    # Right-closed bins; include_lowest keeps the left edge (0 / 18) in bin 1.
    # Values below the first edge fall outside every bin and become NaN.
    df_freq['VehAgeGLM'] = pd.cut(
        df_freq['VehAge'], bins=[0, 1, 10, np.inf], labels=[1, 2, 3],
        include_lowest=True,
    )
    df_freq['DrivAgeGLM'] = pd.cut(
        df_freq['DrivAge'], bins=[18, 21, 26, 31, 41, 51, 71, np.inf],
        labels=[1, 2, 3, 4, 5, 6, 7], include_lowest=True,
    )

    df_freq['BonusMalusGLM'] = df_freq['BonusMalus']
    # NOTE(review): DensityGLM is a plain copy of the raw Density; the
    # log-transformed version lives in Log_Density. Confirm this is intended.
    df_freq['DensityGLM'] = df_freq['Density']

    # Everything not present in the input frame was created here.
    original_cols = set(df.columns)
    fe_cols = [c for c in df_freq.columns if c not in original_cols]

    return df_freq, fe_cols

# Run the feature engineering on the loaded frame and list the columns it added.
df_freq, fe_cols = fe(df)
print(fe_cols)

['VehPowerGLM', 'Log_Density', 'AreaGLM', 'VehAgeGLM', 'DrivAgeGLM', 'BonusMalusGLM', 'DensityGLM']


In [11]:
# Columns of the engineered frame: the original columns followed by the new ones.
df_freq.columns

Index(['ClaimNb', 'Exposure', 'Area', 'VehPower', 'VehAge', 'DrivAge',
       'BonusMalus', 'VehBrand', 'VehGas', 'Density', 'Region', 'VehPowerGLM',
       'Log_Density', 'AreaGLM', 'VehAgeGLM', 'DrivAgeGLM', 'BonusMalusGLM',
       'DensityGLM'],
      dtype='object')

In [26]:
# Sanity check: fe() works on a copy, so df keeps only its original columns.
df.columns

Index(['ClaimNb', 'Exposure', 'Area', 'VehPower', 'VehAge', 'DrivAge',
       'BonusMalus', 'VehBrand', 'VehGas', 'Density', 'Region'],
      dtype='object')

## Count Encoding

In [14]:
# Count encoding: each category is replaced by how often it occurs in df.
ce_cols = [f'{col}_CE' for col in cat_cols]
for source_col, encoded_col in zip(cat_cols, ce_cols):
    category_counts = df[source_col].value_counts()
    df_freq[encoded_col] = df_freq[source_col].map(category_counts)

In [15]:
# Names of the count-encoded columns, one per categorical column.
print(ce_cols)

['Area_CE', 'VehBrand_CE', 'VehGas_CE', 'Region_CE']


In [16]:
# Preview the engineered frame, including the new *_CE columns.
df_freq.head()

Unnamed: 0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,...,Log_Density,AreaGLM,VehAgeGLM,DrivAgeGLM,BonusMalusGLM,DensityGLM,Area_CE,VehBrand_CE,VehGas_CE,Region_CE
0,1,0.1,D,5,0,55,50,B12,Regular,1217,...,7.104965,4,1,6,50,1217,151596,166024,345877,84752
1,1,0.77,D,5,0,55,50,B12,Regular,1217,...,7.104965,4,1,6,50,1217,151596,166024,345877,84752
2,1,0.75,B,6,2,52,50,B12,Diesel,54,...,4.007333,2,2,6,50,54,75459,166024,332136,7994
3,1,0.09,B,7,0,46,50,B12,Diesel,76,...,4.343805,2,1,5,50,76,75459,166024,332136,31329
4,1,0.84,B,7,0,46,50,B12,Diesel,76,...,4.343805,2,1,5,50,76,75459,166024,332136,31329


In [17]:
# Summary statistics after capping; the max row should reflect the caps applied in fe().
df_freq.describe()

Unnamed: 0,ClaimNb,Exposure,VehPower,VehAge,DrivAge,BonusMalus,Density,Log_Density,AreaGLM,BonusMalusGLM,DensityGLM,Area_CE,VehBrand_CE,VehGas_CE,Region_CE
count,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0
mean,0.053179,0.528545,6.454631,6.976124,45.496871,59.757211,1792.422405,5.992367,3.289698,59.757211,1792.422405,140760.622807,126506.501998,339145.74149,76253.964681
std,0.237954,0.364081,2.050906,5.398963,14.129872,15.607906,3958.646564,1.856253,1.382685,15.607906,3958.646564,43012.131717,59046.34044,6869.093947,52875.688439
min,0.0,0.002732,4.0,0.0,18.0,50.0,1.0,0.693147,1.0,50.0,1.0,17954.0,4047.0,332136.0,1326.0
25%,0.0,0.18,5.0,2.0,34.0,50.0,92.0,4.532599,2.0,50.0,92.0,103957.0,53395.0,332136.0,35805.0
50%,0.0,0.49,6.0,6.0,44.0,50.0,393.0,5.976351,3.0,50.0,393.0,151596.0,159861.0,345877.0,69791.0
75%,0.0,0.99,7.0,11.0,55.0,64.0,1658.0,7.41397,4.0,64.0,1658.0,191880.0,162736.0,345877.0,84752.0
max,4.0,1.0,15.0,20.0,90.0,150.0,27000.0,10.203629,6.0,150.0,27000.0,191880.0,166024.0,345877.0,160601.0


In [None]:
# NOTE(review): this cell has no execution count (never run in the saved notebook)
# and repeats the earlier df_freq.columns cell; at this point df_freq also holds
# the *_CE columns. Consider removing one of the duplicates.
df_freq.columns

In [19]:
# Final model inputs: raw columns, then engineered columns, then count encodings.
features = [*base, *fe_cols, *ce_cols]
print(len(features))

21
