In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score
from xgboost import XGBClassifier
import joblib

In [2]:
df_data = pd.read_excel("data/synthetic_ltd_claims_soa_expanded.xlsx")

In [3]:
df_data 

Unnamed: 0,policy_id,age,gender,state,industry,occupation_class,salary_band,elimination_period_days,benefit_pct,coverage_type,...,ssdi_offset_indicator,hours_worked_per_week,employment_status,exposure_year,coverage_start_year,claim_incident,incurred_year,claim_duration_months,recovery_status,return_to_work_flag
0,1,58,M,MA,Healthcare,4,60-80k,90,0.6,EmployerPaid,...,0,40,FullTime,2019,2018,0,,0,,0
1,2,48,F,RI,Services,1,40-60k,30,0.6,EmployerPaid,...,0,40,FullTime,2020,2018,0,,0,,0
2,3,34,F,NY,Services,3,40-60k,90,0.6,EmployerPaid,...,0,40,FullTime,2019,2011,0,,0,,0
3,4,62,F,NJ,Education,2,80-120k,90,0.7,EmployerPaid,...,0,35,FullTime,2019,2011,0,,0,,0
4,5,27,F,RI,Retail,3,80-120k,90,0.6,EmployerPaid,...,0,40,FullTime,2019,2017,0,,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,35,M,ME,Healthcare,3,60-80k,180,0.7,EmployerPaid,...,0,40,FullTime,2022,2014,0,,0,,0
9996,9997,64,M,VT,Manufacturing,3,>120k,90,0.6,EmployerPaid,...,0,25,PartTime,2020,2015,0,,0,,0
9997,9998,34,M,VT,Education,3,60-80k,90,0.6,Voluntary,...,0,35,FullTime,2019,2011,0,,0,,0
9998,9999,20,M,ME,Retail,1,<40k,30,0.6,EmployerPaid,...,0,40,FullTime,2022,2018,0,,0,,0


##### Review of Year-Based Columns (Transformation vs. Drop Decisions)

exposure_year
→ Indicates the calendar period the life is exposed. Useful for creating a new COVID-era feature. Keep for transformation, not as a raw predictor.

coverage_start_year
→ Represents when coverage began. Effectively duplicates years_with_employer. Adds no new signal → can be dropped.

incurred_year
→ Only populated for claims (incident = 1). Not available at prediction time for non-claim lives → must be dropped for modeling.

In [4]:
df_data['is_covid_period'] = (df_data['exposure_year'] >= 2020).astype(int)

In [5]:
df_data_cat_cols = ['gender', 'state', 'industry', 'benefit_duration',
       'salary_band','coverage_type','integration_type','ssdi_offset_indicator','employment_status','employment_status','is_covid_period']

df_data_cont_cols = ['age','occupation_class','elimination_period_days','benefit_pct','years_with_employer','max_monthly_benefit','hours_worked_per_week']


#### - Model building

"recovery_status", "claim_duration_months", "return_to_work_flag" → These fields describe outcomes after a claim, so they are not valid predictors. Dropping them.

In [6]:
X = df_data.drop(columns=['policy_id','claim_incident','incurred_year','coverage_start_year','exposure_year','return_to_work_flag','claim_duration_months','recovery_status'])
y = df_data['claim_incident']

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [8]:
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), df_data_cat_cols),
        ("num", "passthrough", df_data_cont_cols)
    ]
)

In [9]:
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / pos
scale_pos_weight

125.98412698412699

In [31]:
model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.001,
    subsample=0.6,
    colsample_bytree=0.6,
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

clf = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", model)
    ]
)


In [32]:
clf.fit(X_train, y_train)

#### - Evaluate the model

In [33]:
y_proba = clf.predict_proba(X_test)[:,1]
y_proba

array([0.45471787, 0.42687693, 0.46745557, ..., 0.40507635, 0.45402825,
       0.43189546], dtype=float32)

In [38]:
y_pred = (y_proba >= 0.5).astype(int)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [39]:
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))

ROC-AUC: 0.7050151209677419

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.89      0.94      1984
           1       0.02      0.25      0.03        16

    accuracy                           0.88      2000
   macro avg       0.51      0.57      0.49      2000
weighted avg       0.99      0.88      0.93      2000


Confusion matrix:
 [[1764  220]
 [  12    4]]


In [41]:
joblib.dump(clf, "model/ltd_baseline_model_pipeline.pkl")

['model/ltd_baseline_model_pipeline.pkl']