In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

In [2]:
data = pd.read_csv("canada_medical_insurance_forecast_detailed.csv")

In [3]:
data.columns

Index(['claim_id', 'date', 'year', 'month', 'quarter', 'member_id',
       'first_name', 'last_name', 'age', 'sex', 'province', 'employer_size',
       'plan_type', 'chronic_condition', 'risk_score', 'monthly_premium_cad',
       'claim_amount_cad', 'is_high_cost_claim', 'is_forecast',
       'forecast_lower_bound_cad', 'forecast_upper_bound_cad', 'loss_ratio',
       'age_group'],
      dtype='object')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528600 entries, 0 to 528599
Data columns (total 23 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   claim_id                  528600 non-null  object 
 1   date                      528600 non-null  object 
 2   year                      528600 non-null  int64  
 3   month                     528600 non-null  int64  
 4   quarter                   528600 non-null  int64  
 5   member_id                 528600 non-null  object 
 6   first_name                528600 non-null  object 
 7   last_name                 528600 non-null  object 
 8   age                       528600 non-null  int64  
 9   sex                       528600 non-null  object 
 10  province                  528600 non-null  object 
 11  employer_size             528600 non-null  object 
 12  plan_type                 528600 non-null  object 
 13  chronic_condition         196220 non-null  o

In [5]:
data.head()

Unnamed: 0,claim_id,date,year,month,quarter,member_id,first_name,last_name,age,sex,...,chronic_condition,risk_score,monthly_premium_cad,claim_amount_cad,is_high_cost_claim,is_forecast,forecast_lower_bound_cad,forecast_upper_bound_cad,loss_ratio,age_group
0,C00000001,2024-01-01,2024,1,1,M000379,Justin,Miller,57,Male,...,COPD,2.08,202.4,309.2,False,0,,,1.527668,50-64
1,C00000002,2024-01-01,2024,1,1,M014269,Nancy,Harris,61,Female,...,,1.3,231.0,303.67,False,0,,,1.314589,50-64
2,C00000003,2024-01-01,2024,1,1,M004159,Olivia,Baker,36,Female,...,,1.17,78.2,52.17,False,0,,,0.667136,35-49
3,C00000004,2024-01-01,2024,1,1,M006358,John,Morgan,18,Male,...,,0.99,165.6,150.68,False,0,,,0.909903,18-34
4,C00000005,2024-01-01,2024,1,1,M006358,John,Morgan,18,Male,...,,0.99,202.4,176.88,False,0,,,0.873913,18-34


In [None]:
from sklearn.model_selection import train_test_split
cols_to_drop = [
    "claim_amount_cad",
    "claim_id",
    "member_id",
    "first_name",
    "last_name",
    "date",
    "forecast_lower_bound_cad",
    "forecast_upper_bound_cad",
    "loss_ratio",
    "is_forecast"
]

hist = data[data["is_forecast"] == 0].copy()

X = hist.drop(cols_to_drop, axis=1)
y = hist["claim_amount_cad"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100
)



In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ["sex", "province", "employer_size","plan_type", "chronic_condition", "age_group"]

numeric_cols = [col for col in X.columns if col not in categorical_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler


pipe_rf = Pipeline([
    ("preprocess", preprocess),
    ("model", RandomForestRegressor(random_state=100))
])

pipe_xgb = Pipeline([
    ("preprocess", preprocess),
    ("model", XGBRegressor(
        objective="reg:squarederror",
        random_state=100
    ))
])

In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, r2_score
param_lr = {
    "model__fit_intercept": [True, False]
}

param_rf = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [10, 20, None],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2]
}

param_xgb = {
    "model__n_estimators": [200, 400, 600],
    "model__learning_rate": [0.03, 0.05, 0.1],
    "model__max_depth": [3, 5, 7],
    "model__subsample": [0.8, 1.0],
    "model__colsample_bytree": [0.8, 1.0]
}

from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def grid_search(model, param_grid, X_train, y_train, X_test, y_test,
                n_splits=3, n_repeats=2, n_iter=15):

    cv = RepeatedKFold(
        n_splits=n_splits,      
        n_repeats=n_repeats,
        random_state=1000
    )

    gs = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_mean_squared_error',  
        n_jobs=-1,
        verbose=1,
        random_state=1000
    )

   
    gs.fit(X_train, y_train)

 
    best_model = gs.best_estimator_

    y_pred = best_model.predict(X_test)

    
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    try:
        model_name = best_model.named_steps['model'].__class__.__name__
    except (AttributeError, KeyError):
        model_name = best_model.__class__.__name__

    best_cv_mse = -gs.best_score_
    best_cv_rmse = best_cv_mse ** 0.5

    print("\n============================")
    print("Optimized Model:", model_name)
    print("Best CV Params:", gs.best_params_)
    print("Best CV Score (neg MSE):", gs.best_score_)
    print("Best CV RMSE (CV):", best_cv_rmse)
    print("Test RMSE:", rmse)
    print("Test MAE :", mae)
    print("Test R²  :", r2)
    print("============================\n")

    return best_model, gs






In [None]:
best_lr = grid_search(pipe_xgb,param_xgb, X_train, y_train, X_test, y_test)
best_rf = grid_search(pipe_rf,param_rf, X_train, y_train, X_test, y_test)



Fitting 6 folds for each of 15 candidates, totalling 90 fits

Optimized Model: XGBRegressor
Best CV Params: {'model__subsample': 0.8, 'model__n_estimators': 400, 'model__max_depth': 3, 'model__learning_rate': 0.03, 'model__colsample_bytree': 1.0}
Best CV Score (neg MSE): -11332.36189287508
Best CV RMSE (CV): 106.4535668396089
Test RMSE: 106.87891125537621
Test MAE : 82.16261399513309
Test R²  : 0.6647298799441992

Fitting 6 folds for each of 15 candidates, totalling 90 fits

Optimized Model: RandomForestRegressor
Best CV Params: {'model__n_estimators': 400, 'model__min_samples_split': 2, 'model__min_samples_leaf': 2, 'model__max_depth': 10}
Best CV Score (neg MSE): -11370.010892082886
Best CV RMSE (CV): 106.63025317461684
Test RMSE: 106.93494601803724
Test MAE : 82.14958544505798
Test R²  : 0.6643782351496164

