In [45]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.metrics import mean_squared_error, r2_score

import shap
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [47]:
# Load data
df = pd.read_csv("../MachineLearningRating_v3.txt", sep='|', low_memory=False)

categorical_cols = [
    'TransactionMonth', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank',
    'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'MainCrestaZone',
    'SubCrestaZone', 'ItemType', 'VehicleType', 'make', 'Model', 'bodytype',
    'VehicleIntroDate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding',
    'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'TermFrequency',
    'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product',
    'StatutoryClass', 'StatutoryRiskType'
]

numerical_cols = [
    'UnderwrittenCoverID', 'PolicyID', 'PostalCode', 'mmcode', 'RegistrationYear',
    'Cylinders', 'cubiccapacity', 'kilowatts', 'NumberOfDoors', 'CustomValueEstimate',
    'NumberOfVehiclesInFleet', 'SumInsured', 'CalculatedPremiumPerTerm', 'TotalPremium'
]

target_col = 'TotalClaims'

df = df.dropna(subset=[target_col])

X = df[categorical_cols + numerical_cols]
y = df[target_col].astype(float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [48]:
numerical_transformer = SimpleImputer(strategy='mean')

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])


In [43]:
# Preprocessing pipeline for numerical columns: impute missing with mean
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing pipeline for categorical columns: impute missing with constant and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])


In [49]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"RMSE: {rmse:.4f}")
    print(f"R^2: {r2:.4f}")
    return y_pred


In [50]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_estimators=100)
}

fitted_models = {}
predictions = {}

for name, model in models.items():
    print(f"Training {name}...")
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y_train)
    print(f"Evaluating {name}:")
    preds = evaluate_model(pipeline, X_test, y_test)
    fitted_models[name] = pipeline
    predictions[name] = preds
    print("="*40)


Training Linear Regression...




Evaluating Linear Regression:




RMSE: 2204.2326
R^2: 0.0057
Training Decision Tree...




Evaluating Decision Tree:




RMSE: 3257.5545
R^2: -1.1717
Training Random Forest...




KeyboardInterrupt: 

In [51]:
def plot_feature_importance(model_pipeline, model_name):
    # Get the trained model
    model = model_pipeline.named_steps['regressor']
    
    # Get feature names after preprocessing
    ohe = model_pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
    cat_features = ohe.get_feature_names_out(categorical_cols)
    feature_names = numerical_cols + list(cat_features)
    
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        fi = pd.Series(importances, index=feature_names).sort_values(ascending=False).head(20)
        fi.plot(kind='barh')
        plt.title(f"Top 20 Feature Importances - {model_name}")
        plt.show()
    else:
        print(f"{model_name} does not support feature_importances_")
        
# Example for Random Forest
plot_feature_importance(fitted_models['Random Forest'], "Random Forest")


KeyError: 'Random Forest'

In [None]:
explainer = shap.Explainer(fitted_models['XGBoost'].named_steps['regressor'])
# We need to preprocess test data first:
X_test_preprocessed = fitted_models['XGBoost'].named_steps['preprocessor'].transform(X_test)
shap_values = explainer(X_test_preprocessed)

shap.summary_plot(shap_values, features=X_test_preprocessed, feature_names=None)


In [None]:
# We use training data in numpy after preprocessing to init Lime explainer:
X_train_preprocessed = fitted_models['Random Forest'].named_steps['preprocessor'].transform(X_train)

lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train_preprocessed,
    feature_names=numerical_cols + list(fitted_models['Random Forest'].named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)),
    mode='regression'
)

# Explain a single prediction
i = 0
exp = lime_explainer.explain_instance(
    X_test_preprocessed[i].toarray() if hasattr(X_test_preprocessed[i], 'toarray') else X_test_preprocessed[i],
    fitted_models['Random Forest'].predict,
    num_features=10
)

exp.show_in_notebook(show_table=True)
