                STATISTICAL MODELING FOR INSURANCE DATA

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
#import xgboost
#import shap
#import lime
#import lime.lime_tabular




# Path (relative to this notebook) of the pipe-delimited data file.
file_path = '../data/MachineLearningRating_v3.txt'

# Parse the whole file into a DataFrame; fields are separated by '|'.
df = pd.read_csv(file_path, delimiter='|')

# Show the first five rows as a quick sanity check of the parse.
print(df.head(n=5))

In [4]:
# Predictors are every column except the two targets; the targets are
# kept together as a two-column frame.
X = df.drop(columns=['TotalPremium', 'TotalClaims'])
y = df[['TotalPremium', 'TotalClaims']]

# Partition the predictor columns by dtype so each group can get its own
# preprocessing branch.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3,
)

In [5]:
# Ensure consistent data types in columns.
#
# Step 1: object columns whose values all parse as numbers become float.
# Columns that do not parse are deliberately left untouched (best effort).
for col in X.select_dtypes(include=['object']).columns:
    try:
        X[col] = X[col].astype(float)
    except (ValueError, TypeError):
        # Not purely numeric text; handled as text in step 2.
        continue

# Step 2: make the remaining object columns uniformly string-typed.
# The column list is re-derived from the *current* dtypes: reusing the
# earlier `categorical_features` (computed before step 1) would cast the
# columns just converted to float straight back to strings.
text_columns = X.select_dtypes(include=['object']).columns
if len(text_columns) > 0:
    # Keep real missing values as NaN rather than the literal string 'nan',
    # so a downstream imputer can still recognise them.
    X[text_columns] = X[text_columns].astype(str).where(X[text_columns].notna())

# NOTE(review): X_train / X_test were split from X before this cell runs;
# the changes made here presumably do not propagate to them -- re-split
# after this cell if the cleaned dtypes are needed there.

In [None]:
import xgboost as xgb
from sklearn.base import clone

# Define target and features
X = df.drop(['TotalPremium', 'TotalClaims'], axis=1)  # Feature variables
y = df[['TotalPremium', 'TotalClaims']]               # Target variables

# Object columns whose values all parse as numbers become float; columns
# that do not parse are deliberately left as they are (best effort).
for col in X.select_dtypes(include=['object']).columns:
    try:
        X[col] = X[col].astype(float)
    except (ValueError, TypeError):
        continue

# Derive the column groups only AFTER the conversion above.  Using a list
# computed earlier (or in another cell) would cast the freshly converted
# float columns back to strings, and would raise NameError on a fresh kernel.
# NOTE(review): columns of any other dtype (e.g. bool) fall in neither group
# and are dropped by ColumnTransformer's default remainder='drop' -- confirm
# that this is intended.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Make sure all categorical features are treated as strings, while keeping
# genuine missing values as NaN (a plain astype(str) would turn them into the
# literal 'nan', and the 'missing' imputer below would never see them).
if len(categorical_features) > 0:
    X[categorical_features] = (
        X[categorical_features]
        .astype(str)
        .where(X[categorical_features].notna())
    )

# Preprocessing for numeric data: median imputation, then standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: constant imputation, then one-hot
# encoding that ignores categories unseen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps into one ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


def build_pipeline(regressor):
    """Return a Pipeline pairing a fresh copy of `preprocessor` with `regressor`.

    Pipeline does not copy its steps, so each model gets its own clone
    instead of all models sharing (and re-fitting) one transformer instance.
    """
    return Pipeline(steps=[('preprocessor', clone(preprocessor)),
                           ('regressor', regressor)])


# Define models and create pipelines for each
models = {
    'Linear Regression': build_pipeline(LinearRegression()),
    # Seeded so the forest is reproducible, like the train/test split.
    'Random Forest': build_pipeline(
        RandomForestRegressor(n_estimators=100, random_state=42)),
    'XGBoost': build_pipeline(
        xgb.XGBRegressor(objective='reg:squarederror')),
}

# Train models and evaluate: `score` is the estimator's R^2 on the test set.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    results[name] = score
    print(f'{name} R^2 score: {score:.4f}')

# Display final model results
print("\nModel Results:", results)

In [None]:
# Show the dtype of every training-feature column.
print(X_train.dtypes)

In [None]:
from sklearn.metrics import mean_squared_error

# Report the test-set mean squared error of every pipeline in `models`.
for name in models:
    model = models[name]
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    print(f'{name} Mean Squared Error: {mse:.4f}')

In [None]:
# `shap` is used below, but its import at the top of the notebook is
# commented out, so bring it into scope here.
import shap

# Collect feature importances from the tree-based models.  The pipelines are
# used as already fitted by the training cell: re-fitting here would be slow
# and could yield importances that no longer match the scored models.
importances = {}
for name, model in models.items():
    if name in ['Random Forest', 'XGBoost']:
        importances[name] = model.named_steps['regressor'].feature_importances_

# For SHAP values (Example with XGBoost)
xgb_preprocessor = models['XGBoost'].named_steps['preprocessor']
xgb_regressor = models['XGBoost'].named_steps['regressor']

# The regressor was trained on the *preprocessed* matrix, so the explainer
# must be given data in that same encoded feature space, not the raw X_test.
# A seeded subsample keeps the densified matrix and the SHAP run small.
shap_sample = X_test.sample(n=min(1000, len(X_test)), random_state=42)
X_test_encoded = xgb_preprocessor.transform(shap_sample)
if hasattr(X_test_encoded, 'toarray'):
    # The transformer may return a scipy sparse matrix; densify the sample.
    X_test_encoded = X_test_encoded.toarray()
X_test_encoded = pd.DataFrame(
    X_test_encoded,
    columns=xgb_preprocessor.get_feature_names_out(),
    index=shap_sample.index,
)

explainer = shap.Explainer(xgb_regressor)
shap_values = explainer(X_test_encoded)

# Plot SHAP values
# NOTE(review): with a multi-column target the SHAP values are presumably
# 3-D (rows, features, targets); in that case plot one summary per target.
# Confirm against the installed shap / xgboost versions.
if shap_values.values.ndim == 3:
    for target_idx, target_name in enumerate(y_test.columns):
        print(f'SHAP summary for {target_name}')
        shap.summary_plot(shap_values.values[:, :, target_idx], X_test_encoded)
else:
    shap.summary_plot(shap_values, X_test_encoded)

In [None]:
import matplotlib.pyplot as plt

# Draw one line per model: importance value against feature position.
fig, ax = plt.subplots(figsize=(10, 6))
for name, importance in importances.items():
    ax.plot(importance, label=name)
ax.set_title('Feature Importances')
ax.set_xlabel('Feature Index')
ax.set_ylabel('Importance')
ax.legend()
plt.show()

# Textual recap of the R^2 score recorded for each model in `results`.
print("Model Comparison Report:")
for name in results:
    score = results[name]
    print(f"{name}: R^2 score: {score:.4f}")