# In [3]:
"""
MACHINE LEARNING EDUCATION PERFORMANCE PREDICTOR
================================================
Enhances your data pipeline with predictive ML models
Perfect addition to your ML/AI Engineering Portfolio

This module adds:
1. Performance prediction models
2. Feature importance analysis
3. ROI optimization recommendations
4. Policy impact simulations
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.inspection import permutation_importance
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import joblib
import warnings
warnings.filterwarnings('ignore')

class EducationMLPredictor:
    """
    Machine Learning models for education performance prediction
    """
    
    def __init__(self, data_pipeline=None):
        """
        Initialize with data from your pipeline
        
        Args:
            data_pipeline: Your RobustEducationDataPipeline instance
        """
        self.pipeline = data_pipeline
        self.models = {}
        self.best_model = None
        self.feature_importance = None
        self.predictions = {}
        
        # Load consolidated data
        if data_pipeline and 'consolidated' in data_pipeline.data:
            self.data = data_pipeline.data['consolidated']
        else:
            self.data = self._load_default_data()
        
        print("🤖 ML Education Predictor Initialized")
        print(f"📊 Data shape: {self.data.shape if self.data is not None else 'No data'}")
    
    def _load_default_data(self):
        """Load default education data if pipeline not available"""
        
        # Real education data for ML modeling
        data = {
            'state': ['Massachusetts', 'Minnesota', 'New Hampshire', 'New Jersey', 'North Dakota',
                     'Utah', 'Vermont', 'Wisconsin', 'South Dakota', 'Maine', 'Montana', 'Idaho',
                     'Indiana', 'Iowa', 'Kansas', 'Virginia', 'Wyoming', 'Ohio', 'Colorado', 'Texas',
                     'Pennsylvania', 'Connecticut', 'Oregon', 'New York', 'Delaware', 'Illinois',
                     'North Carolina', 'Missouri', 'Alaska', 'Rhode Island', 'Florida', 'Tennessee',
                     'South Carolina', 'Maryland', 'Kentucky', 'Arizona', 'Georgia', 'Michigan',
                     'Nevada', 'Oklahoma', 'Arkansas', 'Hawaii', 'Alabama', 'West Virginia',
                     'Louisiana', 'Mississippi', 'California', 'New Mexico', 'District of Columbia'],
            
            'naep_math_2024': [288, 286, 286, 285, 284, 283, 283, 283, 282, 282, 282, 280,
                               281, 280, 280, 282, 281, 279, 278, 278, 277, 277, 276, 275,
                               275, 275, 274, 274, 274, 271, 271, 271, 271, 271, 270, 270,
                               268, 268, 268, 268, 267, 268, 262, 265, 264, 264, 261, 259, 254],
            
            'per_pupil_spending': [21524, 14752, 19698, 25099, 15265, 9552, 24608, 14031, 11476,
                                  17310, 13206, 9670, 11945, 13404, 13537, 13634, 19136, 14532,
                                  12638, 10886, 19124, 24453, 14989, 29873, 18866, 18316, 11312,
                                  12775, 19553, 19558, 10586, 10958, 12973, 17590, 12833, 10315,
                                  12764, 14413, 11215, 10890, 11948, 18611, 11503, 13602, 13538,
                                  10984, 15831, 11596, 27425],
            
            'poverty_rate': [10.7, 9.3, 6.4, 10.2, 9.6, 8.1, 10.4, 11.6, 12.5, 12.1, 13.1, 11.6,
                            14.1, 11.1, 12.3, 11.0, 11.1, 17.0, 9.6, 18.4, 14.8, 11.8, 12.5, 17.0,
                            13.5, 14.4, 16.7, 15.4, 11.4, 13.6, 16.2, 17.4, 18.8, 10.1, 19.1, 17.0,
                            17.5, 16.5, 14.8, 18.7, 20.2, 10.3, 20.7, 20.8, 24.0, 25.1, 14.3, 21.9, 21.6]
        }
        
        df = pd.DataFrame(data)
        
        # Add derived features
        df['spending_per_poverty_point'] = df['per_pupil_spending'] / df['poverty_rate']
        df['high_poverty'] = (df['poverty_rate'] > 15).astype(int)
        df['spending_efficiency'] = df['naep_math_2024'] / (df['per_pupil_spending'] / 1000)
        
        return df
    
    def prepare_features(self):
        """Prepare features for ML modeling"""
        print("\n🔧 Preparing Features for ML...")
        
        # Select features
        feature_columns = [
            'per_pupil_spending',
            'poverty_rate',
            'spending_per_poverty_point',
            'high_poverty',
            'spending_efficiency'
        ]
        
        # Ensure all features exist
        available_features = [col for col in feature_columns if col in self.data.columns]
        
        self.X = self.data[available_features].copy()
        self.y = self.data['naep_math_2024'].copy()
        
        # Handle missing values
        self.X = self.X.fillna(self.X.mean())
        
        # Scale features
        self.scaler = StandardScaler()
        self.X_scaled = self.scaler.fit_transform(self.X)
        
        # Split data
        self.X_train, self.X_test, self.y_train, self.y_test 