# Average premiumn predictor

Notebook‑ready script that trains a model to predict the **average** home‑insurance
premium per customer from a table of Canadian carrier quotes.

▸ Feature logic matches the hybrid scheme we discussed:
    • High‑card: postal FSA & city   → target‑encoded (CV‑safe)
    • Low‑card / binary flags       → one‑hot
    • Numeric & engineered fields   → passthrough
    • Apartment indicator           → custom FunctionTransformer

Change only the CSV path and (optionally) the customer‑ID recipe if
your dataset already contains a unique identifier.

## 0 ▸ Imports & setup

In [1]:
import numpy as np
import pandas as pd
import random
from pathlib import Path
from datetime import datetime

# machine learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.svm import SVR
# from xgboost import XGBRegressor # may need to brew install libomp for this to import
# from lightgbm import LGBMRegressor

# target encoding
from category_encoders.target_encoder import TargetEncoder

# persist model
import joblib

## 1 ▸ Load raw quote table

In [2]:
df = pd.read_csv('../output/sample_data_54.csv')

# optional: trim column names
df.columns = df.columns.str.strip()

## 2 ▸ Aggregate to one row per customer (average premium target)

In [3]:
feature_keep = [
    # raw categorical / numeric fields to engineer / encode
    "postal_code", "city", "province", "insurance_type", "unit_apt",
    "date_of_birth", "move_in_year",
    "num_fire_extinguishers", "num_mortgages", "occupants",
    "num_claims", "num_cancellations",
    # binary flags
    "has_sprinkler_system", "occupants_non_smokers",
    "active_home_insurance", "ever_insured", "multiline_discount",
    "has_monitored_fire_alarm", "has_deadbolt_locks",
    "has_monitored_burglar_alarm",
]

df_avg = (df
          .groupby("id", as_index=False)
          .agg({**{c: "first" for c in feature_keep},
                "annual_premium": "mean"})
          .rename(columns={"annual_premium": "avg_premium"}))

In [4]:
df_avg.describe()

Unnamed: 0,unit_apt,move_in_year,num_fire_extinguishers,num_mortgages,occupants,num_claims,num_cancellations,avg_premium
count,19.0,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,602.526316,2016.292683,1.195122,1.121951,2.073171,1.195122,0.926829,4569.66324
std,247.260629,4.697041,0.95445,0.87164,0.787091,0.813004,0.818237,2537.345026
min,4.0,2009.0,0.0,0.0,1.0,0.0,0.0,1938.5
25%,456.0,2013.0,0.0,0.0,1.0,1.0,0.0,3283.0
50%,682.0,2016.0,2.0,1.0,2.0,1.0,1.0,3872.0
75%,758.5,2020.0,2.0,2.0,3.0,2.0,2.0,4948.0
max,897.0,2024.0,3.0,2.0,3.0,2.0,2.0,14590.0


## 3 ▸ Feature engineering helpers

In [5]:
THIS_YEAR = datetime.now().year 

# Age & years in dwelling
df_avg["age"] = THIS_YEAR - pd.to_datetime(df_avg["date_of_birth"]).dt.year

df_avg["years_in_dwelling"] = THIS_YEAR - df_avg["move_in_year"].fillna(THIS_YEAR)

# Log‑transforms to soften heavy tails
for col in ["num_claims", "num_cancellations"]:
    df_avg[f"log_{col}"] = np.log1p(df_avg[col])

# Postal FSA (first 3 chars of postal code) 
df_avg["postal_fsa"] = df_avg["postal_code"].str[:3].str.upper()

## 4 ▸ Column lists for the preprocessing pipeline

In [6]:
# High‑cardinality → CV‑safe target encoding
te_cols = ["postal_fsa", "city"]

# Low/medium‑cardinality categoricals → one‑hot
cat_cols = [
    "province", "insurance_type",
    "has_sprinkler_system", "occupants_non_smokers",
    "active_home_insurance", "ever_insured", "multiline_discount",
    "has_monitored_fire_alarm", "has_deadbolt_locks",
    "has_monitored_burglar_alarm",
]

# Numeric features → passthrough
num_cols = [
    "age", "years_in_dwelling", "num_fire_extinguishers",
    "num_mortgages", "occupants", "log_num_claims",
    "log_num_cancellations",
]

In [7]:
# Helper to derive the house/apt flag inside the pipeline
def make_not_apartment(X):
    """Return 1 if *no* unit identifier → standalone dwelling; else 0.
    
    Parameters:
        X (DataFrame): Single-column DataFrame containing the unit_apt column
    Returns:
        DataFrame: Single-column DataFrame with 1 for houses (no unit number) and 0 for apartments
    """
    unit_apt_col = X.iloc[:, 0]  # We know this is the unit_apt column due to ColumnTransformer setup
    return (unit_apt_col.isna() | (unit_apt_col.astype(str).str.strip() == "")).astype(int).to_frame()

## 5 ▸ Assembling the preprocessing pipeline

In [8]:
preprocess = ColumnTransformer(
    transformers=[
        ("te", TargetEncoder(smoothing=0.3), te_cols),  # removed cv parameter and cols parameter
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
        ("not_apt", FunctionTransformer(make_not_apartment, validate=False), ["unit_apt"]),
    ],
    remainder="drop",
)

## 6 ▸ Train/validation split & model training

In [9]:
# Prepare data
X = df_avg.drop(columns=["avg_premium", "id"])
y = df_avg["avg_premium"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df_avg["province"]
)

# Define models to test
models = {
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    # 'XGBoost': XGBRegressor(random_state=42, verbosity=0),
    # 'LightGBM': LGBMRegressor(random_state=42, verbose=-1),
    'Lasso': LassoCV(cv=5, random_state=42),
    'Ridge': RidgeCV(cv=5),
    'SVR': SVR(kernel='rbf')
}

# Dictionary to store results
results = {}

# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name}...")
    pipe = Pipeline([("prep", preprocess), ("reg", model)])
    
    # Train the model
    pipe.fit(X_train, y_train)
    
    # Store the pipeline for later use
    results[name] = {
        'pipeline': pipe,
        'train_score': mean_absolute_error(y_train, pipe.predict(X_train)),
        'test_score': mean_absolute_error(y_test, pipe.predict(X_test))
    }

# Create performance summary
performance_df = pd.DataFrame({
    name: {
        'Train MAE': res['train_score'],
        'Test MAE': res['test_score']
    }
    for name, res in results.items()
}).round(2)

print("\nModel Performance Summary (MAE in CAD):")
print(performance_df.T)


Training Gradient Boosting...

Training Random Forest...

Training Lasso...

Training Ridge...

Training SVR...

Model Performance Summary (MAE in CAD):
                   Train MAE  Test MAE
Gradient Boosting      63.29   2129.76
Random Forest         668.66   2047.92
Lasso                1510.00   2367.70
Ridge                1353.66   2224.24
SVR                  1361.49   1991.97


In [10]:
# Store the best model based on test score
best_model_name = min(results.items(), key=lambda x: x[1]['test_score'])[0]
pipe = results[best_model_name]['pipeline']
print(f"\nBest performing model: {best_model_name}")


Best performing model: SVR


## 7 ▸ Evaluation

In [11]:
cv_mae = -cross_val_score(pipe, X_train, y_train,
                         cv=5, scoring="neg_mean_absolute_error").mean()

holdout_mae = mean_absolute_error(y_test, pipe.predict(X_test))

print(f"\nCross‑val MAE:  {cv_mae:,.0f} CAD")
print(f"Hold‑out MAE:   {holdout_mae:,.0f} CAD\n")


Cross‑val MAE:  1,404 CAD
Hold‑out MAE:   1,992 CAD



## 8 ▸ Convenience inference wrapper

In [12]:
def predict_average_premium(profile: dict):
    """Return the average premium estimate for a *single* customer profile."""
    # Create DataFrame with single profile
    X_new = pd.DataFrame([profile])
    
    # Feature engineering
    # 1. Calculate age from date of birth
    X_new["age"] = THIS_YEAR - pd.to_datetime(X_new["date_of_birth"]).dt.year
    
    # 2. Calculate years in dwelling
    X_new["years_in_dwelling"] = THIS_YEAR - X_new["move_in_year"].fillna(THIS_YEAR)
    
    # 3. Log transform claims and cancellations
    X_new["log_num_claims"] = np.log1p(X_new["num_claims"])
    X_new["log_num_cancellations"] = np.log1p(X_new["num_cancellations"])
    
    # 4. Extract postal FSA (first 3 chars)
    X_new["postal_fsa"] = X_new["postal_code"].str[:3].str.upper()
    
    # Make prediction
    avg_prediction = pipe.predict(X_new)[0]
    
    # Calculate Senchi discount
    percent_discount = random.uniform(0.25, 0.40)
    senchi_prediction = avg_prediction * percent_discount
    discount = avg_prediction - senchi_prediction
    
    return {
        "avg_premium": avg_prediction,
        "senchi_premium": senchi_prediction,
        "percent_discount": percent_discount,
        "discount": discount
    }

## 9 ▸ Example usage

In [26]:
sample_profile = {
    "postal_code": "M9C 2T6",
    "city": "Toronto",
    "province": "ON",
    "insurance_type": "H",
    "unit_apt": "345",  # empty → house
    "date_of_birth": "2000-07-22",
    "move_in_year": 2015,
    "num_fire_extinguishers": 2,
    "num_mortgages": 1,
    "occupants": 3,
    "num_claims": 0,
    "num_cancellations": 0,
    "has_sprinkler_system": 0,
    "occupants_non_smokers": 0,
    "active_home_insurance": 0,
    "ever_insured": 1,
    "multiline_discount": 0,
    "has_monitored_fire_alarm": 0,
    "has_deadbolt_locks": 1,
    "has_monitored_burglar_alarm": 0,
}

In [27]:
result = predict_average_premium(sample_profile)
print(f"""
Premium Summary:
Average Market Premium:  ${result['avg_premium']:,.2f}
Senchi Premium:         ${result['senchi_premium']:,.2f}
Your Savings:           ${result['discount']:,.2f}
Discount Percentage:    {(result['percent_discount']) * 100:.1f}%
""")


Premium Summary:
Average Market Premium:  $4,007.33
Senchi Premium:         $1,160.71
Your Savings:           $2,846.62
Discount Percentage:    29.0%



## 10 ▸ Persist model

In [None]:
MODEL_PATH = Path("avg_home_premium_model.joblib")
joblib.dump(pipe, MODEL_PATH)
print(f"Model saved to {MODEL_PATH.resolve()}")