In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_poisson_deviance, mean_squared_error

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

import xgboost as xgb
import math

import joblib
import pickle
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/french-motor-claims-datasets-fremtpl2freq/freMTPL2freq.csv


<div style="
  border-radius: 20px;
  padding: 25px;
  background: radial-gradient(circle at top, #cf1323, #000043);
  border: 2px solid #00ffc8;
  text-align: center;
  box-shadow: 0 0 25px rgba(0, 255, 250, 0.25);
">
  <h1 style="
    font-size: 28px;
    font-family: 'Trebuchet MS', sans-serif;
    letter-spacing: 2px;
    color: #00ffc8;
    text-shadow: 0 0 12px rgba(0,255,200,0.5);
  ">
    <span style="color:#7aa6ec5;">Load</span> <span style="color:#7aa6ec5;">the</span> <span style="color:#7aa6ec5;">DataFrame</span>
  </h1>
</div>

In [2]:
df = pd.read_csv('/kaggle/input/french-motor-claims-datasets-fremtpl2freq/freMTPL2freq.csv')
df = df.drop(['IDpol'], axis=1)

In [3]:
print('Dataset shape:', df.shape)
print('\nFirst 5 rows:')
df.head()

Dataset shape: (678013, 11)

First 5 rows:


Unnamed: 0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region
0,1,0.1,D,5,0,55,50,B12,Regular,1217,R82
1,1,0.77,D,5,0,55,50,B12,Regular,1217,R82
2,1,0.75,B,6,2,52,50,B12,Diesel,54,R22
3,1,0.09,B,7,0,46,50,B12,Diesel,76,R72
4,1,0.84,B,7,0,46,50,B12,Diesel,76,R72


<div style="
  border-radius: 20px;
  padding: 25px;
  background: radial-gradient(circle at top, #cf1223, #000043);
  border: 2px solid #00ffc8;
  text-align: center;
  box-shadow: 0 0 25px rgba(0, 255, 200, 0.25);
">
  <h1 style="
    font-size: 28x;
    font-family: 'Trebuchet MS', sans-serif;
    letter-spacing: 2px;
    color: #00ffc8;
    text-shadow: 0 0 12px rgba(0,255,200,0.5);
  ">
    <span style="color:#7aa6ec5;">Exploratory</span> <span style="color:#7aa6ec2;">Data</span> <span style="color:#7aa6ec1;">Analysis</span>
  </h1>
</div>

The definitions of each column:

- ClaimNb: Claim numbers during exposure period
- Exposure: Period of exposure for a policy in year(s)
- VehPower: Car power, in categorical format
- VehAge: Car age, in continuous value format
- DrivAge: Driverâ€™s age in years, in continuous value format
- BonusMalus: Bonus / Malus rating, <100 means bonus, >100 means malus, in continous value format
- VehBrand: Car brand, in categorical format
- VechGas: Fuel type, in categorical format
- Area: Density rating of the area car driver lives in, in categorical format
- Density: Density rating, number of inhabitants per km^2, in continous value format
- Region: Policy region in France, in categorical format

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 678013 entries, 0 to 678012
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ClaimNb     678013 non-null  int64  
 1   Exposure    678013 non-null  float64
 2   Area        678013 non-null  object 
 3   VehPower    678013 non-null  int64  
 4   VehAge      678013 non-null  int64  
 5   DrivAge     678013 non-null  int64  
 6   BonusMalus  678013 non-null  int64  
 7   VehBrand    678013 non-null  object 
 8   VehGas      678013 non-null  object 
 9   Density     678013 non-null  int64  
 10  Region      678013 non-null  object 
dtypes: float64(1), int64(6), object(4)
memory usage: 56.9+ MB


In [5]:
df['Area'].value_counts()

Area
C    191880
D    151596
E    137167
A    103957
B     75459
F     17954
Name: count, dtype: int64

In [6]:
df.columns

Index(['ClaimNb', 'Exposure', 'Area', 'VehPower', 'VehAge', 'DrivAge',
       'BonusMalus', 'VehBrand', 'VehGas', 'Density', 'Region'],
      dtype='object')

In [7]:
target = 'ClaimNb'

In [8]:
df.shape

(678013, 11)

In [9]:
print("ClaimNb value counts:")
df['ClaimNb'].value_counts().sort_index()

ClaimNb value counts:


ClaimNb
0     643953
1      32178
2       1784
3         82
4          7
5          2
6          1
8          1
9          1
11         3
16         1
Name: count, dtype: int64

## Missing any values

In [10]:
df.isnull().sum()

ClaimNb       0
Exposure      0
Area          0
VehPower      0
VehAge        0
DrivAge       0
BonusMalus    0
VehBrand      0
VehGas        0
Density       0
Region        0
dtype: int64

#### There is no missing values in the dataset

In [11]:
df.describe()

Unnamed: 0,ClaimNb,Exposure,VehPower,VehAge,DrivAge,BonusMalus,Density
count,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0,678013.0
mean,0.053247,0.52875,6.454631,7.044265,45.499122,59.761502,1792.422405
std,0.240117,0.364442,2.050906,5.666232,14.137444,15.636658,3958.646564
min,0.0,0.002732,4.0,0.0,18.0,50.0,1.0
25%,0.0,0.18,5.0,2.0,34.0,50.0,92.0
50%,0.0,0.49,6.0,6.0,44.0,50.0,393.0
75%,0.0,0.99,7.0,11.0,55.0,64.0,1658.0
max,16.0,2.01,15.0,100.0,100.0,230.0,27000.0


# Feature Engineering

### We will also simplify the data for our GLM model. In particular, we will adjust the following columns:

- ClaimNb: cap at 4 claims
- VehAge: cap at 20 years
- DrivAge: cap at 90 years old
- BonusMalus: cap at 150, round to nearest integer
- Density: apply log
- Exposure: cap at 1 year

One of the reasons for capping variables is that insurance claim counts are highly skewed: most policyholders have 0-2 claims, but a few may have 10+. Extreme values like 20 claims are rare, often due to data errors, fraud, etc. 

In [12]:
df_freq = df.copy()

df_freq['ClaimNb'] = df_freq['ClaimNb'].clip(upper=4)
df_freq['DrivAge'] = df_freq['DrivAge'].clip(upper=90)
df_freq['Exposure'] = df_freq['Exposure'].clip(upper=1.0)

df_freq['VehAge_binned']=pd.cut(
    df_freq['VehAge'], 
    bins=[-np.inf,2, 5, 10, np.inf], 
    labels=['0-2 years', '3-5 years', '6-10 years', '10+ years'])

# Bin driver age into meaningful categories
df_freq['DrivAge_binned'] = pd.cut(
    df_freq['DrivAge'],
    bins=[17, 25, 35, 50, 65, 100],
    labels=['18-25', '26-35', '36-50', '51-65', '65+']
)

# Bin vehicle power (already categorical but we can group further)
df_freq['VehPower_group'] = pd.cut(
    df_freq['VehPower'],
    bins=[0, 6, 8, 10, 15],
    labels=['Low (<6)', 'Medium (6-7)', 'High (8-9)', 'Very High (10+)']
)

# Log transform
df_freq['Log_Density'] = np.log1p(df_freq['Density'])

df_freq['BonusMalus_deviation'] = df_freq['BonusMalus'] - 100


In [13]:
df_freq.columns

Index(['ClaimNb', 'Exposure', 'Area', 'VehPower', 'VehAge', 'DrivAge',
       'BonusMalus', 'VehBrand', 'VehGas', 'Density', 'Region',
       'VehAge_binned', 'DrivAge_binned', 'VehPower_group', 'Log_Density',
       'BonusMalus_deviation'],
      dtype='object')

In [14]:
df_freq.dtypes

ClaimNb                    int64
Exposure                 float64
Area                      object
VehPower                   int64
VehAge                     int64
DrivAge                    int64
BonusMalus                 int64
VehBrand                  object
VehGas                    object
Density                    int64
Region                    object
VehAge_binned           category
DrivAge_binned          category
VehPower_group          category
Log_Density              float64
BonusMalus_deviation       int64
dtype: object

In [15]:
cat_cols = ['Area', 'VehBrand', 'VehGas', 'Region','VehAge_binned', 'DrivAge_binned', 'VehPower_group']
num_cols = ['VehAge', 'DrivAge', 'BonusMalus', 'Log_Density', 'BonusMalus_deviation']

In [16]:
print(num_cols)

['VehAge', 'DrivAge', 'BonusMalus', 'Log_Density', 'BonusMalus_deviation']


In [17]:
df_freq.head()

Unnamed: 0,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,VehAge_binned,DrivAge_binned,VehPower_group,Log_Density,BonusMalus_deviation
0,1,0.1,D,5,0,55,50,B12,Regular,1217,R82,0-2 years,51-65,Low (<6),7.104965,-50
1,1,0.77,D,5,0,55,50,B12,Regular,1217,R82,0-2 years,51-65,Low (<6),7.104965,-50
2,1,0.75,B,6,2,52,50,B12,Diesel,54,R22,0-2 years,51-65,Low (<6),4.007333,-50
3,1,0.09,B,7,0,46,50,B12,Diesel,76,R72,0-2 years,36-50,Medium (6-7),4.343805,-50
4,1,0.84,B,7,0,46,50,B12,Diesel,76,R72,0-2 years,36-50,Medium (6-7),4.343805,-50


## Preparing data for Modeling

In [18]:
features = cat_cols + num_cols
print(len(features))

12


In [19]:
X = df_freq[features]
y = df_freq['ClaimNb']
exposure = df_freq['Exposure']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Exposure shape: {exposure.shape}")

Features shape: (678013, 12)
Target shape: (678013,)
Exposure shape: (678013,)


## Train-test-split

In [20]:
# Split the data (stratify not needed for regression)
X_train, X_test, y_train, y_test, exposure_train, exposure_test = train_test_split(
    X, y, exposure, test_size=0.2, random_state=42
)

In [21]:
# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(), cat_cols)
    ])

# Fit preprocessor on training data
preprocessor.fit(X_train)

X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)


## Poisson Regression Modeling

In [22]:
poisson_model = PoissonRegressor(
    alpha = 1e-5,
    solver='newton-cholesky',
    max_iter=1200
).fit(X_train_processed, y_train, sample_weight=exposure_train)

In [23]:
y_pred_train = poisson_model.predict(X_train_processed)
y_pred_test = poisson_model.predict(X_test_processed)

# Since we used sample_weight, predictions are expected frequency per unit exposure
# Multiply by exposure to get expected claim count
expected_claims_train = y_pred_train * exposure_train
expected_claims_test = y_pred_test * exposure_test

print("Poisson Regression Results:")
print(f"Training Poisson deviance: {mean_poisson_deviance(y_train, y_pred_train, sample_weight=exposure_train):.4f}")
print(f"Test Poisson deviance: {mean_poisson_deviance(y_test, y_pred_test, sample_weight=exposure_test):.4f}")
print(f"Training MSE: {mean_squared_error(y_train, expected_claims_train):.4f}")
print(f"Test MSE: {mean_squared_error(y_test, expected_claims_test):.4f}")

Poisson Regression Results:
Training Poisson deviance: 0.3522
Test Poisson deviance: 0.3552
Training MSE: 0.0561
Test MSE: 0.0562


In [24]:
feature_names = []
feature_names.extend(num_cols)
cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
feature_names.extend(cat_cols)

In [25]:
print(feature_names)

['VehAge', 'DrivAge', 'BonusMalus', 'Log_Density', 'BonusMalus_deviation', 'Area', 'VehBrand', 'VehGas', 'Region', 'VehAge_binned', 'DrivAge_binned', 'VehPower_group']


## Saving the model for download

In [26]:
with open('frequency_model3.pkl', 'wb') as f:
    pickle.dump(poisson_model, f)

# Save feature information
feature_info = {
    'categorical_features': cat_cols,
    'numerical_features': num_cols,
    'feature_names': feature_names,
    'baseline_rate': float(np.exp(poisson_model.intercept_)) if 'poisson_model' in dir() else 0.1
}

with open('feature_info3.pkl', 'wb') as f:
    pickle.dump(feature_info, f)

with open('preprocessor3.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

print("Models saved! Download these files for local development.")

Models saved! Download these files for local development.
