In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score

# Load the cleaned insurance dataset from disk.
df = pd.read_csv('../data/insurance_data_clean.csv', low_memory=False)

# 1. Claim Severity Prediction
# Regression on TotalClaims, restricted to rows where TotalClaims > 0.
severity_df = df.loc[df['TotalClaims'] > 0].copy()
y_severity = severity_df['TotalClaims']
X_severity = severity_df.drop(columns=['TotalClaims', 'CalculatedPremiumPerTerm'])

# One-hot encode the categorical columns, then map +/-inf and missing values
# to 0 so the estimator receives only finite numbers.
X_severity_encoded = (
    pd.get_dummies(X_severity, drop_first=True)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Hold out 20% of the claim rows for evaluation (fixed seed for repeatability).
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_severity_encoded,
    y_severity,
    test_size=0.2,
    random_state=42,
)

severity_model = RandomForestRegressor(random_state=42)
severity_model.fit(X_train_s, y_train_s)
y_pred_s = severity_model.predict(X_test_s)

# Evaluate on the held-out rows: RMSE (same units as TotalClaims) and R^2.
rmse_severity = np.sqrt(mean_squared_error(y_test_s, y_pred_s))
r2_severity = r2_score(y_test_s, y_pred_s)

print(f"Claim Severity Model RMSE: {rmse_severity:.2f}")
print(f"Claim Severity Model R^2: {r2_severity:.2f}")

# 2. Claim Probability Prediction (Classification)
# Binary target: 1 when the row has TotalClaims > 0, otherwise 0.
df['has_claim'] = (df['TotalClaims'] > 0).astype(int)
X_prob = df.drop(['TotalClaims', 'CalculatedPremiumPerTerm', 'has_claim'], axis=1)
y_prob = df['has_claim']

# One-hot encode the categorical columns, then map +/-inf and missing values
# to 0 so the estimator receives only finite numbers.
X_prob_encoded = pd.get_dummies(X_prob, drop_first=True)
X_prob_encoded = X_prob_encoded.replace([np.inf, -np.inf], np.nan)
X_prob_encoded = X_prob_encoded.fillna(0)

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_prob_encoded, y_prob, test_size=0.2, random_state=42
)

# This model is fit on every row of the dataset, not just the claim rows, so
# it is the expensive step of the notebook. n_jobs=-1 builds the trees on all
# available cores; random_state still fixes which trees are grown.
prob_model = RandomForestClassifier(random_state=42, n_jobs=-1)
prob_model.fit(X_train_p, y_train_p)

# Column 1 of predict_proba is the probability of the positive class
# (has_claim == 1).
y_pred_prob = prob_model.predict_proba(X_test_p)[:, 1]

auc_prob = roc_auc_score(y_test_p, y_pred_prob)
print(f"Claim Probability Model ROC-AUC: {auc_prob:.2f}")

# 3. Premium Calculation Example
# Risk-based premium = P(claim) * predicted claim severity, computed for the
# rows in the probability model's test set; expense loading and profit margin
# are then applied on top.

# The severity model was fit on one-hot columns derived from the claims-only
# subset, whereas X_test_p was encoded from the full dataset. The two column
# sets can differ in membership and order, so align X_test_p to the severity
# model's training columns: dummy columns the severity model never saw are
# dropped, and ones absent from X_test_p are added as all-zero.
X_test_for_severity = X_test_p.reindex(columns=X_train_s.columns, fill_value=0)

predicted_severity = severity_model.predict(X_test_for_severity)
predicted_probability = y_pred_prob
risk_based_premium = predicted_probability * predicted_severity

# Add expense loading and profit margin as needed:
expense_loading = 100  # Example fixed value
profit_margin = 0.10   # 10% margin

# The margin is applied after the loading, i.e. to (risk premium + expenses).
final_premium = risk_based_premium + expense_loading
final_premium = final_premium * (1 + profit_margin)

# Output sample
premium_df = pd.DataFrame({
    'Predicted_Probability': predicted_probability,
    'Predicted_Severity': predicted_severity,
    'Risk_Based_Premium': risk_based_premium,
    'Final_Premium': final_premium
})
premium_df.head()

Claim Severity Model RMSE: 8508.62
Claim Severity Model R^2: 0.95


KeyboardInterrupt: 