In [None]:
# notebooks/03_model_development.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys
import os
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from data_tools import load_processed_data
from models import (
    prepare_features_for_modeling,
    train_linear_regression_per_zipcode,
    train_and_evaluate_ml_model,
    get_feature_importances,
    save_model,
    load_model,
    MODELS_DIR # Import constants from models.py or utils.py
)
from utils import PROCESSED_DATA_PATH # For DVC tracking, ensure path matches

# --- Load Processed Data ---
# Driver cell for the modeling stage: load the processed dataset, fit the
# per-zipcode linear regressions, train the overall ML model, then save the
# fitted pipeline and reload it as a smoke test.
df = load_processed_data()

if df is None:
    # The loader signals failure by returning None; nothing below can run.
    print("Cannot proceed with ML Modeling: Processed data not loaded.")
else:
    # --- Part 1: Linear Regression Model per Zipcode for Total Claims ---
    print("\n--- Fitting Linear Regression Models per Zipcode ---")
    # Make sure 'total_claims' and 'postal_code' are available after preprocessing.
    # A copy is passed so the helper works on its own frame rather than on `df`.
    zipcode_models = train_linear_regression_per_zipcode(df.copy(), target_col='total_claims')
    print(f"Trained models for {len(zipcode_models)} unique zipcodes.")

    # You could now use these models to predict claims for new data within specific zipcodes.
    # E.g., model_for_1459 = zipcode_models.get(1459)
    # if model_for_1459:
    #     sample_data = pd.DataFrame(...) # new car features for zipcode 1459
    #     predicted_claim = model_for_1459.predict(sample_data)


    # --- Part 2: Develop a Machine Learning Model to Predict Optimal Premium Values ---
    # Assuming 'total_premium' as the target for 'optimal premium values' (this might need refinement based on business logic)
    # Or, if optimal premium means profit maximization, you might predict risk (claims) and then set premium.
    # For this exercise, let's target `total_premium` as the 'optimal premium' or a proxy.

    # NOTE(review): `target_for_ml` is only used below for the log message and
    # the saved file name; it is not passed to `prepare_features_for_modeling`.
    # Changing it here does not by itself change the target the helper selects --
    # confirm the helper's own target matches this value.
    target_for_ml = 'total_premium' # Or 'total_claims' if you're predicting risk

    # Prepare features and target
    X, y, preprocessor, numerical_features, categorical_features = prepare_features_for_modeling(df.copy())

    if X is None or y is None:
        print("Feature preparation failed. Cannot train ML model.")
    else:
        print(f"\n--- Training Overall ML Model for {target_for_ml} ---")
        # Train and evaluate a RandomForestRegressor
        ml_pipeline, X_train_processed, y_train_ml = train_and_evaluate_ml_model(
            X, y, preprocessor, model_type='RandomForest'
        )

        # Report on the explaining power of the important features.
        # Only attempted when the pipeline's 'regressor' step is a RandomForestRegressor.
        regressor = ml_pipeline.named_steps['regressor']
        if type(regressor).__name__ == 'RandomForestRegressor':
            # Result is kept in a cell-level variable for further inspection.
            feature_importances_df = get_feature_importances(ml_pipeline, numerical_features, categorical_features)
        else:
            print("Feature importances not available for this model type.")

        # --- Save the trained ML Model ---
        # Build the file name once so the name and the path handed to
        # `save_model` cannot drift apart.
        model_filename = f'alpha_care_{target_for_ml}_predictor.pkl'
        model_save_path = os.path.join(MODELS_DIR, model_filename)
        save_model(ml_pipeline, model_name=model_filename, path=model_save_path)

        # --- Example: Load and test the saved model ---
        print("\n--- Testing Model Loading ---")
        loaded_model = load_model(path=model_save_path)
        # Compare against None explicitly (as done for `df`, `X` and `y` above)
        # instead of relying on the truthiness of the loaded estimator object.
        if loaded_model is not None:
            print("Model loaded successfully. Ready for deployment/inference.")
        else:
            # Make a failed round-trip visible instead of silently moving on.
            print(f"Model could not be loaded from {model_save_path}.")

    print("\n--- ML Modeling Complete. ---")