# California Housing - Model Development & Training

This notebook implements comprehensive model training and evaluation for California housing price prediction using our engineered features from Phase 3.

## Objectives
1. Load processed California housing data
2. Train multiple regression models (Linear, Ridge, Lasso, Random Forest, XGBoost)
3. Perform hyperparameter tuning for optimal performance
4. Evaluate and compare model performance
5. Analyze feature importance and model interpretability
6. Select best model for deployment
7. Save trained models for web application

## 1. Setup and Data Loading

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
from datetime import datetime

# Custom modules
import sys
sys.path.append('..')

from src.model_training import CaliforniaHousingModelTrainer
from src.model_validation import CaliforniaHousingModelValidator
from src.data_pipeline import CaliforniaHousingPipeline

# Import model classes
from src.models.linear_models import *
from src.models.ensemble_models import *

# Configure plotting
# NOTE: statement order matters here. The style sheet is applied first, so the
# palette and figure-size settings that follow take precedence over anything
# the style sheet sets.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
# NOTE(review): with no category or module filter this hides every warning for
# the rest of the session, including deprecation and convergence warnings that
# may be raised during training — confirm this is intended.
warnings.filterwarnings('ignore')

# Announce that setup finished and stamp the start time of this session.
print("🏠 California Housing Model Development Environment Setup Complete")
print(f"📅 Training session started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
# Load processed California housing data.
#
# Outcome: X_train, X_val, y_train and y_val are bound either from the cached
# Phase 3 output or, when that cache is missing or incomplete, from a fresh
# run of the preprocessing pipeline.
print("📊 Loading processed California housing data...")

# Every split read below must be present before the cache is trusted; a
# partial cache is treated exactly like a missing one.
required_splits = ('X_train', 'X_val', 'y_train', 'y_val')

try:
    # Try to load previously processed data.
    # Presumably the empty frames are placeholders because only the loader is
    # needed on this path — verify against CaliforniaHousingPipeline.
    pipeline = CaliforniaHousingPipeline(pd.DataFrame(), pd.DataFrame())
    datasets = pipeline.load_processed_data()

    if not all(split in datasets for split in required_splits):
        # Raising FileNotFoundError routes into the rebuild branch below
        # instead of failing later with a KeyError on a missing split.
        raise FileNotFoundError("Processed data not found")

    X_train = datasets['X_train']
    X_val = datasets['X_val']
    y_train = datasets['y_train']
    y_val = datasets['y_val']
    print("✅ Processed data loaded successfully from Phase 3")

except FileNotFoundError:
    print("⚠️ Processed data not found. Running preprocessing pipeline...")
    # Load and process data
    from src.data_loader import load_data_with_fallback
    
    train_data, test_data = load_data_with_fallback()
    pipeline = CaliforniaHousingPipeline(train_data, test_data)
    results = pipeline.run_pipeline(save_processed=True)
    
    # The pipeline exposes the splits as attributes after run_pipeline().
    X_train = pipeline.X_train
    X_val = pipeline.X_val
    y_train = pipeline.y_train
    y_val = pipeline.y_val
    
    print("✅ Data processed and ready for training")

# Summarise what will be fed to the trainers.
print(f"\n📈 California Housing Dataset Ready:")
print(f"  • Training features: {X_train.shape}")
print(f"  • Validation features: {X_val.shape}")
print(f"  • Training target: {y_train.shape}")
print(f"  • Validation target: {y_val.shape}")
print(f"  • Feature names (first 10): {list(X_train.columns[:10])}")
print(f"  • Target statistics: Mean=${y_train.mean():,.0f}, Range=${y_train.min():,.0f}-${y_train.max():,.0f}")

## 2. Model Training Setup

In [None]:
# Build the trainer around the train/validation splits loaded above
print("🚀 Initializing California Housing Model Trainer...")
trainer = CaliforniaHousingModelTrainer(X_train, y_train, X_val, y_val)

# Fill the trainer's registry with the California housing model set
trainer.register_california_housing_models()

# Only these hyperparameters are echoed next to each registered model
highlighted_params = ('alpha', 'n_estimators', 'learning_rate', 'max_depth')

print("\n📝 Registered Models for California Housing:")
for position, (model_name, (model_class, params)) in enumerate(trainer.models.items(), start=1):
    print(f"  {position}. {model_name} ({model_class.__name__})")
    if not params:
        continue
    shown_params = {name: value for name, value in params.items() if name in highlighted_params}
    if shown_params:
        print(f"     Parameters: {shown_params}")

# Static description of how the training phases below are configured
print("\n⚙️ Training Configuration:")
print("  • Cross-validation folds: 5")
print("  • Hyperparameter tuning: Will be applied to selected models")
print("  • Evaluation metrics: RMSE, R², MAE, MAPE")
print("  • Target: California house values (median_house_value)")

## 3. Model Training - Phase 1 (Default Parameters)

In [None]:
# Phase 1: fit every registered model with its default parameters to get a
# quick baseline before any tuning is attempted
print("🔧 Phase 1: Training models with default parameters...")
print("=" * 70)

# Tuning is switched off here; 5-fold cross-validation is still requested
trained_models = trainer.train_all_models(cv_folds=5, tune_hyperparameters=False)

trained_count = len(trained_models)
print(f"\n✅ Phase 1 Training Completed: {trained_count} models trained")

In [None]:
# Score the Phase 1 models and report the strongest one
print("📊 Evaluating baseline model performance...")

baseline_evaluation = trainer.evaluate_models()
baseline_columns = ['model', 'val_r2', 'val_rmse', 'val_mae', 'rmse_ratio']
print("\n📈 BASELINE MODEL PERFORMANCE:")
print(baseline_evaluation[baseline_columns].round(4))

# The trainer picks the best model; its metrics row is looked up by name
baseline_best_name, baseline_best_model = trainer.get_best_model()
is_best_row = baseline_evaluation['model'] == baseline_best_name
baseline_best_metrics = baseline_evaluation.loc[is_best_row].iloc[0]

best_val_r2 = baseline_best_metrics['val_r2']
best_val_rmse = baseline_best_metrics['val_rmse']
# Falls back to 0 when the evaluation frame has no relative-RMSE column
best_rmse_pct = baseline_best_metrics.get('rmse_relative_pct', 0)

print(f"\n🏆 Baseline Best Model: {baseline_best_name}")
print(f"  • Validation R²: {best_val_r2:.4f}")
print(f"  • Validation RMSE: ${best_val_rmse:,.0f}")
print(f"  • RMSE as % of mean price: {best_rmse_pct:.1f}%")

In [None]:
# Draw the trainer's side-by-side comparison of the baseline models
print("📊 Creating baseline model comparison visualizations...")
comparison_figsize = (16, 12)
trainer.plot_model_comparison(figsize=comparison_figsize)

## 4. Model Training - Phase 2 (Hyperparameter Tuning)

In [None]:
# Phase 2: re-train the leading baseline models with hyperparameter search
print("🔍 Phase 2: Hyperparameter tuning for top models...")
print("=" * 70)

# Take the first three rows of the baseline evaluation table.
# NOTE(review): this assumes evaluate_models() returns rows ordered best-first — confirm.
top_3_models = baseline_evaluation['model'].head(3).tolist()
print(f"\n🎯 Selected models for hyperparameter tuning: {top_3_models}")

# A separate trainer keeps the tuned models apart from the baselines
tuning_trainer = CaliforniaHousingModelTrainer(X_train, y_train, X_val, y_val)

# Re-register each selected model under a "<name>_tuned" alias, reusing the
# class and keyword arguments recorded in the baseline trainer
for model_name in top_3_models:
    if model_name not in trainer.models:
        continue
    model_class, kwargs = trainer.models[model_name]
    tuning_trainer.register_model(model_class, f'{model_name}_tuned', **kwargs)

models_to_tune = len(tuning_trainer.models)
print(f"\n🔄 Training {models_to_tune} models with hyperparameter tuning...")
tuned_models = tuning_trainer.train_all_models(cv_folds=5, tune_hyperparameters=True)

print(f"\n✅ Phase 2 Tuning Completed: {len(tuned_models)} models tuned")

In [None]:
# Report tuned-model metrics and the R² change relative to each baseline
if not tuned_models:
    print("⚠️ No tuned models available")
else:
    tuned_evaluation = tuning_trainer.evaluate_models()

    tuned_columns = ['model', 'val_r2', 'val_rmse', 'val_mae', 'rmse_ratio']
    print("\n📊 TUNED MODELS PERFORMANCE:")
    print(tuned_evaluation[tuned_columns].round(4))

    # Side-by-side table: one row per model that was actually tuned
    print("\n📈 BASELINE vs TUNED COMPARISON:")
    print(f"{'Model':<20} {'Baseline R²':<12} {'Tuned R²':<12} {'Improvement':<12}")
    print("-" * 60)

    for original_name in top_3_models:
        tuned_name = f'{original_name}_tuned'

        # Skip models that were selected but never made it into the tuned table
        if tuned_name not in tuned_evaluation['model'].values:
            continue

        baseline_row = baseline_evaluation['model'] == original_name
        tuned_row = tuned_evaluation['model'] == tuned_name
        baseline_r2 = baseline_evaluation.loc[baseline_row, 'val_r2'].iloc[0]
        tuned_r2 = tuned_evaluation.loc[tuned_row, 'val_r2'].iloc[0]
        improvement = tuned_r2 - baseline_r2

        print(f"{original_name:<20} {baseline_r2:<12.4f} {tuned_r2:<12.4f} {improvement:+.4f}")

## 5. Comprehensive Model Validation

In [None]:
# Pool the baseline and tuned models into one mapping; on a name clash the
# tuned entry wins because it is merged last
all_models = dict(trained_models)
all_models.update(tuned_models)

print(f"🔍 Comprehensive validation of {len(all_models)} models...")

# The validator receives every model together with both data splits
validator = CaliforniaHousingModelValidator(all_models, X_train, y_train, X_val, y_val)

# Render the validator's performance overview
performance_figsize = (20, 15)
validator.plot_model_performance(figsize=performance_figsize)

In [None]:
# Rank every model and pick the one in the first row of the comparison table
final_comparison = validator.compare_models()

comparison_columns = [
    'model',
    'val_r2',
    'val_rmse',
    'val_mae',
    'rmse_ratio',
    'rmse_as_pct_of_mean_price',
]

print("🏆 FINAL MODEL COMPARISON - CALIFORNIA HOUSING")
print("=" * 70)
print(final_comparison[comparison_columns].round(4))

# NOTE(review): row 0 is taken as the winner, which assumes compare_models()
# returns rows ordered best-first — confirm.
final_best_metrics = final_comparison.iloc[0]
final_best_name = final_best_metrics['model']
final_best_model = all_models[final_best_name]

# 'N/A' is shown when the comparison table carries no within-$20K column
within_20k = final_best_metrics.get('prediction_accuracy_within_20k', 'N/A')

print(f"\n🎯 FINAL BEST MODEL: {final_best_name}")
print(f"  • Validation R²: {final_best_metrics['val_r2']:.4f}")
print(f"  • Validation RMSE: ${final_best_metrics['val_rmse']:,.0f}")
print(f"  • Validation MAE: ${final_best_metrics['val_mae']:,.0f}")
print(f"  • RMSE as % of mean house value: {final_best_metrics['rmse_as_pct_of_mean_price']:.1f}%")
print(f"  • Predictions within $20K: {within_20k}%")

## 6. Best Model Deep Analysis

In [None]:
# Performance summary and feature-importance listing for the selected model
print(f"🔍 DEEP ANALYSIS: {final_best_name}")
print("=" * 70)

# The model prints its own performance summary
final_best_model.print_performance_summary()

print("\n📊 Feature Importance Analysis:")
importance_df = final_best_model.get_feature_importance()

if importance_df is None:
    # Some model types return no importance table
    print("⚠️ Feature importance not available for this model type")
else:
    print("\n🔝 Top 15 Most Important Features:")
    leading = importance_df.head(15)
    ranked_pairs = zip(leading['feature'], leading['importance'])
    for rank, (feature_name, importance_value) in enumerate(ranked_pairs, start=1):
        print(f"  {rank:2d}. {feature_name:<30}: {importance_value:.4f}")

    # Chart of the top 20 features
    final_best_model.plot_feature_importance(top_n=20, figsize=(12, 10))

In [None]:
# Validation-set diagnostics for the selected model: predictions, then residuals
print(f"📈 Prediction Analysis for {final_best_name}...")

# Predicted vs actual values on the validation split
prediction_plot_title = f'{final_best_name} - California Housing Validation Set Predictions'
final_best_model.plot_predictions(X_val, y_val, title=prediction_plot_title, figsize=(12, 8))

# Residual diagnostics come from the validator, which looks the model up by name
residual_figsize = (18, 12)
validator.plot_residual_analysis(final_best_name, figsize=residual_figsize)

In [None]:
# Learning curves, then a per-price-range breakdown of prediction confidence
print(f"📚 Learning Curves Analysis for {final_best_name}...")
validator.plot_learning_curves(final_best_name, figsize=(12, 6))

# Mapping of range name -> stats dict, as returned by the validator
confidence_analysis = validator.create_prediction_confidence_analysis(final_best_name)

print("\n🎯 PREDICTION CONFIDENCE BY HOUSE VALUE RANGE:")
print(f"{'Price Range':<15} {'Sample Count':<12} {'Mean Error':<12} {'Accuracy (±$20K)':<15}")
print("-" * 70)

for range_name, range_stats in confidence_analysis.items():
    sample_count = range_stats['sample_count']
    mean_error = range_stats['mean_error']
    accuracy_20k = range_stats['accuracy_within_20k']
    print(f"{range_name:<15} {sample_count:<12} ${mean_error:<11,.0f} {accuracy_20k:<14.1f}%")
    # Second line shows the range bounds, followed by a blank separator line
    print(f"{'':>15} {range_stats['price_range']}", end="\n\n")

## 7. Model Interpretability Analysis

In [None]:
# Model interpretability for California housing.
#
# Part 1 collects the top-10 feature importances of each interpretable baseline
# model and ranks features by how many models agree on them.
# Part 2 prints plain-language insights for the top-5 features of the best
# model selected earlier (final_best_model).
print("🧠 MODEL INTERPRETABILITY ANALYSIS")
print("="*70)

# {model_name: {feature: importance}} for the top-10 features of each model
feature_importance_comparison = {}

# Only baseline models registered under these names are inspected
model_types_for_interpretation = ['linear', 'ridge', 'lasso', 'random_forest']
available_models = [name for name in model_types_for_interpretation if name in all_models]

for model_name in available_models:
    model = all_models[model_name]
    # Loop-local name: the insights section below must not depend on whichever
    # model happened to be inspected last in this loop.
    model_importance = model.get_feature_importance()

    if model_importance is None:
        continue

    # Get top 10 features for this model
    top_features = model_importance.head(10)
    feature_importance_comparison[model_name] = dict(zip(top_features['feature'], top_features['importance']))

# Find most consistently important features
if feature_importance_comparison:
    all_important_features = set()
    for features in feature_importance_comparison.values():
        all_important_features.update(features.keys())

    # Count how many models consider each feature important.
    # A model that does not list the feature contributes 0 to the average.
    # NOTE(review): importances from different model families are averaged
    # as-is; confirm they are on comparable scales.
    feature_consensus = {}
    for feature in all_important_features:
        count = sum(1 for features in feature_importance_comparison.values() if feature in features)
        avg_importance = np.mean([features.get(feature, 0) for features in feature_importance_comparison.values()])
        feature_consensus[feature] = {'model_count': count, 'avg_importance': avg_importance}

    # Sort by consensus (model count, then average importance)
    consensus_ranking = sorted(feature_consensus.items(),
                              key=lambda x: (x[1]['model_count'], x[1]['avg_importance']),
                              reverse=True)

    print(f"\n🎯 FEATURE IMPORTANCE CONSENSUS (Top 15):")
    print(f"{'Feature':<30} {'Models':<8} {'Avg Importance':<15}")
    print(f"-"*60)

    for feature, stats in consensus_ranking[:15]:
        print(f"{feature:<30} {stats['model_count']}/{len(feature_importance_comparison):<7} {stats['avg_importance']:<15.4f}")

# California housing specific insights
print(f"\n🏠 CALIFORNIA HOUSING INSIGHTS:")
# Read the importances of the selected best model explicitly instead of reusing
# a variable left over from the loop above or from an earlier cell.
insight_importance_df = final_best_model.get_feature_importance()
if insight_importance_df is not None:
    top_features = insight_importance_df.head(5)['feature'].tolist()

    # Substring of a feature name -> message printed when that feature ranks high
    housing_insights = {
        'median_income': "💰 Income is the strongest predictor of housing values in CA",
        'total_rooms': "🏠 Larger properties command higher prices",
        'housing_median_age': "📅 Newer homes are generally more valuable",
        'longitude': "🗺️ East-West location affects prices (proximity to coast)",
        'latitude': "🗺️ North-South location indicates different CA markets",
        'population': "👥 Population density impacts housing demand",
        'ocean_proximity': "🌊 Proximity to ocean significantly affects values",
        'rooms_per_household': "📏 Housing density is a key value driver",
        'distance_to_Los_Angeles': "🏙️ Distance from major cities affects prices",
        'distance_to_San_Francisco': "🏙️ SF proximity is a major price factor"
    }

    print(f"\nKey predictive insights from top features:")
    for feature in top_features:
        feature_lower = feature.lower()
        for key, insight in housing_insights.items():
            # Compare case-insensitively on both sides; the mixed-case keys
            # could never match a lower-cased feature name otherwise.
            if key.lower() in feature_lower:
                print(f"  • {insight}")
                break
        else:
            # No known keyword matched: report the feature name itself
            print(f"  • {feature}: Important engineered feature for CA housing")

## 8. Final Model Selection and Evaluation

In [None]:
# Final model selection combining baseline and tuned models.
#
# Builds a fresh validator over all models, prints the ranking, binds the
# champion (first row of the comparison table) and grades its relative RMSE.
print("🏆 FINAL MODEL SELECTION")
print("="*70)

# Combine all models for final comparison
final_validator = CaliforniaHousingModelValidator(all_models, X_train, y_train, X_val, y_val)
final_comparison = final_validator.compare_models()

# The within-$20K accuracy column is treated as optional elsewhere in this
# notebook (it is read with .get), so only columns that exist are displayed.
ranking_columns = [
    column
    for column in ('model', 'val_r2', 'val_rmse', 'rmse_as_pct_of_mean_price', 'prediction_accuracy_within_20k')
    if column in final_comparison.columns
]

print("\n📊 FINAL MODEL RANKING:")
print(final_comparison[ranking_columns].round(4))

# Select champion model.
# NOTE(review): assumes compare_models() returns rows ordered best-first — confirm.
champion_model_name = final_comparison.iloc[0]['model']
champion_model = all_models[champion_model_name]
champion_metrics = final_comparison.iloc[0]

# Format the optional metric separately: applying ':.1f' directly to the
# 'N/A' fallback string would raise ValueError.
accuracy_within_20k = champion_metrics.get('prediction_accuracy_within_20k')
accuracy_text = 'N/A' if accuracy_within_20k is None else f"{accuracy_within_20k:.1f}"

print(f"\n🥇 CHAMPION MODEL: {champion_model_name}")
print(f"  • Final Validation R²: {champion_metrics['val_r2']:.4f}")
print(f"  • Final Validation RMSE: ${champion_metrics['val_rmse']:,.0f}")
print(f"  • Error as % of mean house value: {champion_metrics['rmse_as_pct_of_mean_price']:.1f}%")
print(f"  • Predictions within $20K: {accuracy_text}%")

# Model performance interpretation: the first upper bound that the relative
# RMSE falls under decides the grade; anything at or above 30% (or NaN) gets
# the lowest grade.
rmse_pct = champion_metrics['rmse_as_pct_of_mean_price']
grade_thresholds = (
    (10, "Excellent"),
    (15, "Very Good"),
    (20, "Good"),
    (30, "Fair"),
)
performance_grade = next(
    (grade for upper_bound, grade in grade_thresholds if rmse_pct < upper_bound),
    "Needs Improvement",
)

print(f"\n📈 Model Performance Grade: {performance_grade}")
print(f"💡 Interpretation: The model's predictions are typically within {rmse_pct:.1f}% of the actual house value")

## 9. Champion Model Analysis

In [None]:
# Diagnostics for the champion: validation run, prediction and residual plots,
# feature importance (when available) and learning curves
print(f"🏆 CHAMPION MODEL DEEP DIVE: {champion_model_name}")
print("=" * 70)

# Result is kept for interactive inspection; nothing below reads it
champion_validation = validator.validate_single_model(champion_model)

# Predicted vs actual on the validation split
champion_plot_title = f'{champion_model_name} - California Housing Predictions'
champion_model.plot_predictions(X_val, y_val, title=champion_plot_title, figsize=(12, 8))

# Residual plots drawn by the model itself
champion_model.plot_residuals(X_val, y_val, figsize=(18, 6))

# Only chart importances when the model exposes them
champion_importance = champion_model.get_feature_importance()
if champion_importance is not None:
    champion_model.plot_feature_importance(top_n=20, figsize=(12, 10))

# Learning curves are produced by the validator, keyed by model name
validator.plot_learning_curves(champion_model_name, figsize=(12, 6))

## 10. Model Comparison Across Price Ranges

In [None]:
# Analyze model performance across different house value ranges.
#
# The validation targets are split into five equal-frequency bands and, for
# each of the top three models, R² and RMSE are computed inside every band.
print("🏠 MODEL PERFORMANCE BY HOUSE VALUE RANGE")
print("="*70)


def _range_metrics(actual, predicted):
    """Return R², RMSE, sample count and mean actual value for one price band.

    The metrics are computed with numpy so this cell does not rely on
    ``r2_score`` / ``mean_squared_error`` being leaked into the namespace by a
    wildcard import.

    Args:
        actual: 1-D array of true target values for the band (non-empty).
        predicted: 1-D array of predictions aligned with ``actual``.

    Returns:
        dict with keys ``r2``, ``rmse``, ``sample_count`` and ``mean_actual``.
    """
    squared_errors = (actual - predicted) ** 2
    ss_res = squared_errors.sum()
    ss_tot = ((actual - actual.mean()) ** 2).sum()
    if ss_tot > 0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        # Constant targets inside the band: R² is undefined, so report 1.0 for
        # a perfect fit and 0.0 otherwise rather than dividing by zero.
        r2 = 1.0 if ss_res == 0 else 0.0
    return {
        'r2': float(r2),
        'rmse': float(np.sqrt(squared_errors.mean())),
        'sample_count': int(actual.size),
        'mean_actual': float(actual.mean()),
    }


# Create price range analysis for top 3 models
top_3_final = final_comparison.head(3)['model'].tolist()

# Quintile bands over the validation targets
price_ranges = pd.qcut(y_val, q=5, labels=['Low ($)', 'Low-Med ($)', 'Medium ($)', 'Med-High ($)', 'High ($)'])

# Work on positional arrays so the boolean masks line up with the prediction
# arrays regardless of how y_val is indexed.
actual_values = np.asarray(y_val, dtype=float)

range_analysis = {}
for model_name in top_3_final:
    model = all_models[model_name]
    predictions = np.asarray(model.predict(X_val), dtype=float).ravel()

    range_performance = {}
    for price_range in price_ranges.cat.categories:
        mask = (price_ranges == price_range).to_numpy()
        if not mask.any():
            # Empty band: nothing to score
            continue
        range_performance[price_range] = _range_metrics(actual_values[mask], predictions[mask])

    range_analysis[model_name] = range_performance

# Display results
for model_name, ranges in range_analysis.items():
    print(f"\n📊 {model_name} Performance by Price Range:")
    print(f"{'Range':<12} {'Samples':<8} {'Mean Price':<12} {'R²':<8} {'RMSE':<10}")
    print(f"-"*60)

    for range_name, metrics in ranges.items():
        print(f"{range_name:<12} {metrics['sample_count']:<8} "
              f"${metrics['mean_actual']:<11,.0f} {metrics['r2']:<8.3f} "
              f"${metrics['rmse']:<9,.0f}")

## 11. Model Saving and Export

In [None]:
# Persist every trained model, the champion, the comparison table and a
# JSON deployment summary, then list what ended up in the models directory
import json
import os

print("💾 SAVING TRAINED MODELS")
print("=" * 70)

# Baseline models are written by the trainer that produced them
print("\n📦 Saving baseline models...")
trainer.save_models()

# Tuned models only exist when Phase 2 produced something
if tuned_models:
    print("\n📦 Saving tuned models...")
    tuning_trainer.save_models()

# The champion gets its own file so it can be found without knowing its name
from config.settings import MODELS_DIR
champion_path = MODELS_DIR / 'champion_california_housing_model.pkl'
champion_model.save_model(champion_path)

print(f"\n🏆 Champion model saved separately: {champion_path}")

# Comparison table as CSV, without the row index
comparison_csv_path = MODELS_DIR / 'final_california_housing_model_comparison.csv'
final_comparison.to_csv(comparison_csv_path, index=False)

# Metadata describing this training run; metric values are cast to plain
# floats before being serialised
deployment_summary = dict(
    champion_model=champion_model_name,
    champion_r2=float(champion_metrics['val_r2']),
    champion_rmse=float(champion_metrics['val_rmse']),
    dataset='california_housing',
    training_samples=len(X_train),
    validation_samples=len(X_val),
    feature_count=X_train.shape[1],
    target_column='median_house_value',
    training_date=datetime.now().isoformat(),
    all_models_trained=list(all_models.keys()),
)

summary_path = MODELS_DIR / 'deployment_summary.json'
with open(summary_path, 'w') as summary_file:
    json.dump(deployment_summary, summary_file, indent=2)

print(f"\n✅ All models and results saved to: {MODELS_DIR}")
print("📋 Files saved:")
# Only model, table and summary artefacts are listed
artefact_suffixes = ('.pkl', '.csv', '.json')
for filename in os.listdir(MODELS_DIR):
    if filename.endswith(artefact_suffixes):
        print(f"  • {filename}")

## 12. Final Training Report

In [None]:
# Print the trainer's and validator's text reports, then a closing summary
print("📋 GENERATING FINAL TRAINING REPORT...")
print("=" * 70)

# Report produced by the baseline trainer
baseline_report = trainer.generate_training_report()
print(baseline_report)

# Visual separator between the two reports
print(f"\n{'=' * 80}\n")

# Report produced by the validator that covers all models
validation_report = final_validator.generate_validation_report()
print(validation_report)

# Headline numbers for the champion and the run as a whole
champion_r2 = champion_metrics['val_r2']
champion_rmse = champion_metrics['val_rmse']
model_count = len(all_models)
feature_count = X_train.shape[1]
property_count = len(X_train)

print("\n🎉 PHASE 4: MODEL DEVELOPMENT & TRAINING - COMPLETED!")
print("\n📊 Final Results Summary:")
print(f"  • Champion Model: {champion_model_name}")
print(f"  • Best Validation R²: {champion_r2:.4f}")
print(f"  • Best Validation RMSE: ${champion_rmse:,.0f}")
print(f"  • Models Trained: {model_count}")
print(f"  • Features Used: {feature_count}")
print(f"  • Dataset: California Housing ({property_count:,} properties)")

print("\n🚀 Ready for Phase 5: Web Application & Deployment!")
print("💡 The champion model is ready to predict California housing prices!")