# Task 4: Machine Learning Modeling
## Insurance Risk Analytics - Predictive Modeling

### Objectives:
- Build predictive models for claim severity prediction
- Develop premium optimization models
- Evaluate and compare multiple ML algorithms
- Analyze feature importance using SHAP/LIME
- Provide business recommendations

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
import sys
sys.path.append('../')
from src.data_processing import DataProcessor
from src.models import LinearRegressionModel, RandomForestModel, XGBoostModel, ModelComparator

# Plot appearance: apply the base stylesheet first, then layer the default
# figure size and the colour palette on top of it.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_palette('husl')

In [None]:
# Read the raw insurance dataset into `df`.
# A missing file is reported with a hint instead of raising; note that on that
# path this cell does not assign `df`, so later cells that read it need the
# file to be in place first.
try:
    df = pd.read_csv('../data/insurance_data.csv', low_memory=False)
except FileNotFoundError:
    print('Data file not found. Please add your data to ../data/insurance_data.csv')
else:
    n_rows, n_cols = df.shape
    print(f'Data loaded: {n_rows} rows, {n_cols} columns')

## Model 1: Claim Severity Prediction

Predict TotalClaims for policies that have claims (TotalClaims > 0)

In [None]:
# Claim-severity dataset: run the processor's feature creation and
# missing-value handling, then keep only rows with TotalClaims > 0.
processor = DataProcessor()
df_processed = processor.handle_missing_values(processor.create_features(df))
has_claim = df_processed['TotalClaims'] > 0
df_with_claims = df_processed.loc[has_claim].copy()

# Split features/target, holding out 20% of the rows for testing
# (fixed seed so the split is reproducible).
X_sev, y_sev = processor.prepare_for_modeling(df_with_claims, target_column='TotalClaims')
split_sev = train_test_split(X_sev, y_sev, test_size=0.2, random_state=42)
X_train_sev, X_test_sev, y_train_sev, y_test_sev = split_sev

n_train_sev = X_train_sev.shape[0]
n_test_sev = X_test_sev.shape[0]
print(f'Training set: {n_train_sev} samples')
print(f'Test set: {n_test_sev} samples')

In [None]:
# Fit the three candidate regressors on the claim-severity training split.
# Both tree ensembles share the same size and seed.
tree_kwargs = {'n_estimators': 100, 'random_state': 42}
models_sev = {
    'Linear Regression': LinearRegressionModel(),
    'Random Forest': RandomForestModel(**tree_kwargs),
    'XGBoost': XGBoostModel(**tree_kwargs),
}

print('Training models for Claim Severity Prediction...')
for name in models_sev:
    print(f'  Training {name}...')
    models_sev[name].train(X_train_sev, y_train_sev)
print('All models trained successfully!')

In [None]:
# Evaluate models
# Register every trained severity model with a comparator, score them all on
# the held-out split, print the comparison table and persist it as CSV.
from pathlib import Path

comparator_sev = ModelComparator()
for model in models_sev.values():
    comparator_sev.add_model(model)

results_sev = comparator_sev.evaluate_all(X_test_sev, y_test_sev)
print('\n' + '='*80)
print('CLAIM SEVERITY PREDICTION - MODEL COMPARISON')
print('='*80)
print(results_sev.to_string(index=False))
print('='*80)

# Save results
# to_csv does not create missing parent directories, so make sure ../reports
# exists before writing (no-op when it is already there).
results_sev_path = Path('../reports/claim_severity_model_results.csv')
results_sev_path.parent.mkdir(parents=True, exist_ok=True)
results_sev.to_csv(results_sev_path, index=False)

## Model 2: Premium Optimization

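Predict premium values as a basis for premium optimization. The target is `CalculatedPremiumPerTerm` when that column is present, otherwise `TotalPremium`.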

In [None]:
# Prepare data for premium prediction
# Trains and compares the same three regressors as the severity section, but
# on all processed policies and with a premium column as the target.
from pathlib import Path

# Prefer the per-term calculated premium; fall back to TotalPremium when the
# processed frame does not carry that column.
target = 'CalculatedPremiumPerTerm' if 'CalculatedPremiumPerTerm' in df_processed.columns else 'TotalPremium'
# NOTE(review): confirm prepare_for_modeling does not leave other premium
# columns in the feature matrix, which would leak the target.
X_prem, y_prem = processor.prepare_for_modeling(df_processed, target_column=target)
X_train_prem, X_test_prem, y_train_prem, y_test_prem = train_test_split(
    X_prem, y_prem, test_size=0.2, random_state=42
)

models_prem = {
    'Linear Regression': LinearRegressionModel(),
    'Random Forest': RandomForestModel(n_estimators=100, random_state=42),
    'XGBoost': XGBoostModel(n_estimators=100, random_state=42)
}

print('Training models for Premium Optimization...')
# Only the model objects are needed here, so iterate the values directly.
for model in models_prem.values():
    model.train(X_train_prem, y_train_prem)

comparator_prem = ModelComparator()
for model in models_prem.values():
    comparator_prem.add_model(model)

results_prem = comparator_prem.evaluate_all(X_test_prem, y_test_prem)
print('\n' + '='*80)
print('PREMIUM OPTIMIZATION - MODEL COMPARISON')
print('='*80)
print(results_prem.to_string(index=False))
print('='*80)

# Save results
# to_csv does not create missing parent directories, so make sure ../reports
# exists before writing (no-op when it is already there).
results_prem_path = Path('../reports/premium_optimization_model_results.csv')
results_prem_path.parent.mkdir(parents=True, exist_ok=True)
results_prem.to_csv(results_prem_path, index=False)

## Feature Importance Analysis

In [None]:
# Get best model and feature importance
# The best severity model is the one with the lowest RMSE on the held-out
# split; each candidate is re-evaluated here to obtain that score.
from pathlib import Path

best_model = min(models_sev.values(), key=lambda m: m.evaluate(X_test_sev, y_test_sev)['RMSE'])
print(f'Best model: {best_model.model_name}')

# Models without importances expose None here and are skipped.
if best_model.feature_importance_ is not None:
    top_features = best_model.feature_importance_.head(10)
    print('\nTop 10 Most Important Features:')
    print(top_features)

    # Visualize
    fig, ax = plt.subplots(figsize=(10, 6))
    top_features.plot(kind='barh', ax=ax, color='steelblue')
    ax.set_xlabel('Feature Importance', fontsize=11)
    ax.set_title(f'Top 10 Feature Importance - {best_model.model_name}', fontsize=12, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()

    # savefig fails if the target directory is missing, so create
    # ../reports/figures up front (no-op when it already exists).
    importance_fig_path = Path('../reports/figures/feature_importance.png')
    importance_fig_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(importance_fig_path, dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# SHAP Analysis (if available)
# Best-effort: a missing shap package or any failure during the analysis is
# reported with a message instead of stopping the notebook.
from pathlib import Path

try:
    import shap
except ImportError:
    # Only a failed import of shap itself means the package is missing; import
    # errors raised later inside the analysis are reported by the handler below.
    print('SHAP not installed. Install with: pip install shap')
else:
    try:
        if isinstance(best_model, (RandomForestModel, XGBoostModel)):
            print('Computing SHAP values...')
            # Explain only the first 100 test rows; take the sample once so the
            # values and the plot are guaranteed to use the same rows.
            shap_sample = X_test_sev.head(100)
            explainer = shap.TreeExplainer(best_model.model)
            shap_values = explainer.shap_values(shap_sample)
            shap.summary_plot(shap_values, shap_sample, show=False)
            plt.tight_layout()

            # savefig fails if the target directory is missing.
            shap_fig_path = Path('../reports/figures/shap_summary.png')
            shap_fig_path.parent.mkdir(parents=True, exist_ok=True)
            plt.savefig(shap_fig_path, dpi=300, bbox_inches='tight')
            plt.show()
            print('SHAP analysis completed!')
        else:
            print('SHAP analysis available for tree-based models only.')
    except Exception as e:
        print(f'Error in SHAP analysis: {e}')

## Business Recommendations

In [None]:
# Print a summary of the best model per task and the most influential features.


def _summarize_best(results, heading, target_label):
    """Print the lowest-RMSE row of a model-comparison table and return it.

    Args:
        results: Comparison table with 'Model', 'RMSE' and 'R2' columns.
        heading: Title line printed above the summary.
        target_label: Plural noun for the predicted quantity, used in the
            interpretation sentence.

    Returns:
        The row of ``results`` with the smallest RMSE.
    """
    best = results.loc[results['RMSE'].idxmin()]
    print(f'\n{heading}')
    print(f'   Best Model: {best["Model"]}')
    print(f'   RMSE: {best["RMSE"]:.2f}')
    print(f'   R²: {best["R2"]:.3f}')
    if best['R2'] < 0:
        # A negative R² cannot be read as a share of explained variance.
        print(f'   Interpretation: R² is negative - model fits worse than predicting the mean of {target_label}.')
    else:
        print(f'   Interpretation: Model explains {best["R2"]*100:.1f}% of variance in {target_label}.')
    return best


print('\n' + '='*80)
print('BUSINESS RECOMMENDATIONS')
print('='*80)

best_sev = _summarize_best(results_sev, '1. CLAIM SEVERITY PREDICTION:', 'claim amounts')
best_prem = _summarize_best(results_prem, '2. PREMIUM OPTIMIZATION:', 'premiums')

# Feature ranking comes from the best claim-severity model chosen earlier;
# it is skipped when that model exposes no importances.
if best_model.feature_importance_ is not None:
    top_3 = best_model.feature_importance_.head(3)
    print('\n3. TOP INFLUENTIAL FEATURES:')
    for feat, imp in top_3.items():
        print(f'   - {feat}: {imp:.4f}')
    print('\n   Recommendation: Focus pricing adjustments on these key factors.')

print('\n' + '='*80)