### WHO Life Expectancy Dataset - Phase 2B: Quick Model Validation
### Testing EDA predictions with baseline models

In [10]:
# WHO Life Expectancy Dataset - Phase 2B: Quick Model Validation
# Testing EDA predictions with baseline models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

class BaselineModelValidator:
    """
    Quick baseline models to validate EDA predictions
    Tests feature importance and performance expectations
    """
    
    def __init__(self):
        """Create empty model/result registries and the shared feature scaler."""
        # One scaler instance is shared: fitted in train_baseline_models and
        # reused (transform only) in evaluate_models.
        self.scaler = StandardScaler()
        # model name -> estimator, populated by train_baseline_models
        self.models = dict()
        # model name -> metrics dict, populated by evaluate_models
        self.results = dict()
        
    def load_and_prepare_data(self):
        """Load data and prepare for modeling based on EDA insights"""
        print("=" * 60)
        print("PHASE 2B: BASELINE MODEL VALIDATION")
        print("=" * 60)
        
        # Load the original data
        df = pd.read_csv('../Life Expectancy Data.csv')
        df.columns = df.columns.str.strip()
        
        print(f"Dataset loaded: {df.shape[0]} records, {df.shape[1]} features")
        
        # Basic preprocessing for quick testing
        # Remove rows where target is missing
        df_clean = df.dropna(subset=['Life expectancy']).copy()
        print(f"After removing missing targets: {df_clean.shape[0]} records")
        
        return df_clean
    
    def create_regional_mapping(self, df):
        """Quick regional mapping for testing"""
        def assign_region(country):
            africa_keywords = ['Algeria', 'Angola', 'Botswana', 'Burkina', 'Burundi', 'Cameroon', 'Chad', 'Congo', 
                              'Ethiopia', 'Ghana', 'Kenya', 'Madagascar', 'Malawi', 'Mali', 'Morocco', 'Niger', 
                              'Nigeria', 'Rwanda', 'Senegal', 'Sierra Leone', 'Somalia', 'South Africa', 'Sudan', 
                              'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe']
            europe_keywords = ['Albania', 'Austria', 'Belgium', 'Croatia', 'Cyprus', 'Czech', 'Denmark', 'Estonia', 
                              'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 
                              'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Norway', 'Poland', 
                              'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 
                              'United Kingdom']
            asia_keywords = ['Afghanistan', 'Bangladesh', 'Bhutan', 'Cambodia', 'China', 'India', 'Indonesia', 
                            'Iran', 'Iraq', 'Japan', 'Jordan', 'Kazakhstan', 'Kuwait', 'Kyrgyzstan', 'Laos', 
                            'Lebanon', 'Malaysia', 'Maldives', 'Mongolia', 'Myanmar', 'Nepal', 'Oman', 'Pakistan', 
                            'Philippines', 'Qatar', 'Saudi Arabia', 'Singapore', 'Sri Lanka', 'Syria', 'Tajikistan', 
                            'Thailand', 'Turkey', 'Turkmenistan', 'United Arab Emirates', 'Uzbekistan', 'Viet Nam', 'Yemen']
            americas_keywords = ['Argentina', 'Bolivia', 'Brazil', 'Canada', 'Chile', 'Colombia', 'Costa Rica', 
                               'Cuba', 'Dominican Republic', 'Ecuador', 'El Salvador', 'Guatemala', 'Haiti', 
                               'Honduras', 'Jamaica', 'Mexico', 'Nicaragua', 'Panama', 'Paraguay', 'Peru', 
                               'Trinidad and Tobago', 'United States', 'Uruguay', 'Venezuela']
            
            if any(keyword in country for keyword in africa_keywords):
                return 'Africa'
            elif any(keyword in country for keyword in europe_keywords):
                return 'Europe'
            elif any(keyword in country for keyword in asia_keywords):
                return 'Asia'
            elif any(keyword in country for keyword in americas_keywords):
                return 'Americas'
            else:
                return 'Other'
        
        df['Region'] = df['Country'].apply(assign_region)
        return df
    
    def select_eda_features(self, df):
        """Select features based on EDA insights for quick testing"""
        print("\nFEATURE SELECTION BASED ON EDA INSIGHTS")
        print("-" * 40)
        
        # Tier 1 features (highest importance from EDA)
        tier1_features = ['Schooling', 'Adult Mortality', 'HIV/AIDS']
        
        # Tier 2 features (strong predictors from EDA)
        tier2_features = ['Income composition of resources', 'BMI', 'GDP']
        
        # Available features check
        available_tier1 = [f for f in tier1_features if f in df.columns and df[f].notna().sum() > 100]
        available_tier2 = [f for f in tier2_features if f in df.columns and df[f].notna().sum() > 100]
        
        print("Tier 1 features (EDA top predictors):")
        for feature in available_tier1:
            correlation = df[feature].corr(df['Life expectancy'])
            missing_pct = (df[feature].isnull().sum() / len(df)) * 100
            print(f"  {feature}: correlation={correlation:.3f}, missing={missing_pct:.1f}%")
        
        print("\nTier 2 features (EDA strong predictors):")
        for feature in available_tier2:
            correlation = df[feature].corr(df['Life expectancy'])
            missing_pct = (df[feature].isnull().sum() / len(df)) * 100
            print(f"  {feature}: correlation={correlation:.3f}, missing={missing_pct:.1f}%")
        
        # Add categorical features
        df['Status_Developed'] = (df['Status'] == 'Developed').astype(int)
        
        # Regional dummies
        regional_dummies = pd.get_dummies(df['Region'], prefix='Region')
        df = pd.concat([df, regional_dummies], axis=1)
        
        categorical_features = ['Status_Developed'] + list(regional_dummies.columns)
        
        all_features = available_tier1 + available_tier2 + categorical_features
        
        print(f"\nTotal features selected: {len(all_features)}")
        
        return df, all_features
    
    def prepare_modeling_data(self, df, features):
        """Prepare data for modeling with basic missing value handling"""
        print("\nDATA PREPARATION FOR MODELING")
        print("-" * 40)
        
        # Create feature matrix and target vector
        feature_df = df[features + ['Life expectancy']].copy()
        
        # Basic missing value imputation (median for numerical, mode for categorical)
        for col in features:
            if feature_df[col].dtype in ['object', 'category']:
                feature_df[col].fillna(feature_df[col].mode()[0], inplace=True)
            else:
                feature_df[col].fillna(feature_df[col].median(), inplace=True)
        
        # Remove any remaining rows with missing target
        feature_df = feature_df.dropna(subset=['Life expectancy'])
        
        X = feature_df[features]
        y = feature_df['Life expectancy']
        
        print(f"Final modeling dataset: {X.shape[0]} samples, {X.shape[1]} features")
        print(f"Target variable range: {y.min():.1f} - {y.max():.1f} years")
        
        return X, y
    
    def create_temporal_split(self, df, X, y):
        """Create temporal train/test split based on EDA insights"""
        print("\nTEMPORAL TRAIN/TEST SPLIT")
        print("-" * 40)
        
        # EDA showed consistent trends, so use temporal split: 2000-2012 train, 2013-2015 test
        df_with_target = df.dropna(subset=['Life expectancy']).copy()
        
        # Align indices
        aligned_indices = X.index.intersection(df_with_target.index)
        X_aligned = X.loc[aligned_indices]
        y_aligned = y.loc[aligned_indices]
        df_aligned = df_with_target.loc[aligned_indices]
        
        train_mask = df_aligned['Year'] <= 2012
        test_mask = df_aligned['Year'] >= 2013
        
        X_train = X_aligned[train_mask]
        X_test = X_aligned[test_mask]
        y_train = y_aligned[train_mask]
        y_test = y_aligned[test_mask]
        
        print(f"Training set: {X_train.shape[0]} samples (years 2000-2012)")
        print(f"Test set: {X_test.shape[0]} samples (years 2013-2015)")
        print(f"Training target range: {y_train.min():.1f} - {y_train.max():.1f} years")
        print(f"Test target range: {y_test.min():.1f} - {y_test.max():.1f} years")
        
        return X_train, X_test, y_train, y_test
    
    def train_baseline_models(self, X_train, y_train):
        """Fit the four baseline regressors and register them in ``self.models``.

        The shared scaler is fitted on ``X_train`` here.  The linear models are
        trained on the scaled matrix; the random forest is trained on the raw
        features.

        Returns:
            The scaled training matrix produced by ``self.scaler``.
        """
        print("\nTRAINING BASELINE MODELS")
        print("-" * 40)

        # Fit the scaler on training data only; evaluation reuses it as-is
        X_train_scaled = self.scaler.fit_transform(X_train)

        candidates = {
            'Linear Regression': LinearRegression(),
            'Ridge Regression': Ridge(alpha=1.0),
            'Lasso Regression': Lasso(alpha=0.1),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
        }

        print("Training models...")
        for name, estimator in candidates.items():
            # Only the tree ensemble is fed unscaled inputs
            inputs = X_train if name == 'Random Forest' else X_train_scaled
            estimator.fit(inputs, y_train)

            self.models[name] = estimator
            print(f"  {name}: trained")

        return X_train_scaled
    
    def evaluate_models(self, X_train, X_test, y_train, y_test, X_train_scaled):
        """Score every registered model on the train and test partitions.

        For each entry of ``self.models`` this computes R², RMSE and MAE on
        both partitions, prints them, and stores the full table in
        ``self.results``.  The random forest is scored on raw features; every
        other model on the scaled matrices.

        Returns:
            Dict mapping model name to a dict with keys 'train_r2', 'test_r2',
            'train_rmse', 'test_rmse', 'train_mae' and 'test_mae'.
        """
        print("\nMODEL EVALUATION")
        print("-" * 40)

        # Reuse the already-fitted scaler; never refit on test data
        X_test_scaled = self.scaler.transform(X_test)

        results = {}

        for name, model in self.models.items():
            # Feed each model the same representation it was trained on
            if name == 'Random Forest':
                train_inputs, test_inputs = X_train, X_test
            else:
                train_inputs, test_inputs = X_train_scaled, X_test_scaled

            train_pred = model.predict(train_inputs)
            test_pred = model.predict(test_inputs)

            scores = {
                'train_r2': r2_score(y_train, train_pred),
                'test_r2': r2_score(y_test, test_pred),
                'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
                'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
                'train_mae': mean_absolute_error(y_train, train_pred),
                'test_mae': mean_absolute_error(y_test, test_pred)
            }
            results[name] = scores

            print(f"{name}:")
            print(f"  Train R²: {scores['train_r2']:.3f}, Test R²: {scores['test_r2']:.3f}")
            print(f"  Train RMSE: {scores['train_rmse']:.2f}, Test RMSE: {scores['test_rmse']:.2f}")
            print(f"  Train MAE: {scores['train_mae']:.2f}, Test MAE: {scores['test_mae']:.2f}")
            print()

        self.results = results
        return results
    
    def validate_feature_importance(self, X_train, feature_names):
        """Validate feature importance against EDA findings"""
        print("FEATURE IMPORTANCE VALIDATION")
        print("-" * 40)
        
        # Get feature importance from Random Forest
        rf_model = self.models['Random Forest']
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("Top 10 features by Random Forest importance:")
        for i, row in feature_importance.head(10).iterrows():
            print(f"  {row['feature']}: {row['importance']:.3f}")
        
        # Compare to EDA predictions
        print("\nEDA PREDICTION VALIDATION:")
        eda_top_features = ['Schooling', 'Adult Mortality', 'HIV/AIDS']
        
        for feature in eda_top_features:
            if feature in feature_importance['feature'].values:
                rank = feature_importance[feature_importance['feature'] == feature].index[0] + 1
                importance = feature_importance[feature_importance['feature'] == feature]['importance'].iloc[0]
                print(f"  {feature}: Rank #{rank}, Importance: {importance:.3f}")
            else:
                print(f"  {feature}: Not in selected features")
        
        return feature_importance
    
    def compare_to_eda_predictions(self, results):
        """Compare model performance to EDA predictions"""
        print("EDA PREDICTION COMPARISON")
        print("-" * 40)
        
        print("EDA Prediction: R² should be 0.75-0.85")
        print("Actual Results:")
        
        best_model = None
        best_r2 = 0
        
        for model_name, metrics in results.items():
            test_r2 = metrics['test_r2']
            print(f"  {model_name}: R² = {test_r2:.3f}", end="")
            
            if test_r2 >= 0.75:
                print(" [MEETS EDA PREDICTION]")
            elif test_r2 >= 0.70:
                print(" [CLOSE TO EDA PREDICTION]")
            else:
                print(" [BELOW EDA PREDICTION]")
            
            if test_r2 > best_r2:
                best_r2 = test_r2
                best_model = model_name
        
        print(f"\nBest performing model: {best_model} (R² = {best_r2:.3f})")
        
        if best_r2 >= 0.75:
            print("SUCCESS: EDA predictions validated!")
        elif best_r2 >= 0.65:
            print("PARTIAL: Good performance, may need feature engineering")
        else:
            print("REVIEW: Performance below expectations, investigate further")
        
        return best_model, best_r2
    
    def generate_validation_report(self, results, feature_importance, best_model, best_r2):
        """Generate comprehensive validation report"""
        print("\n" + "=" * 60)
        print("PHASE 2B VALIDATION REPORT")
        print("=" * 60)
        
        print("PERFORMANCE SUMMARY:")
        print(f"  Best Model: {best_model}")
        print(f"  Best R²: {best_r2:.3f}")
        print(f"  EDA Prediction Range: 0.75-0.85")
        print(f"  Prediction Accuracy: {'VALIDATED' if best_r2 >= 0.75 else 'NEEDS IMPROVEMENT'}")
        
        print("\nFEATURE VALIDATION:")
        eda_features = ['Schooling', 'Adult Mortality', 'HIV/AIDS']
        rf_top3 = feature_importance.head(3)['feature'].tolist()
        
        matches = len(set(eda_features) & set(rf_top3))
        print(f"  EDA Top 3 in Model Top 3: {matches}/3")
        print(f"  Feature Prediction Accuracy: {'VALIDATED' if matches >= 2 else 'PARTIAL'}")
        
        print("\nNEXT STEPS:")
        if best_r2 >= 0.75:
            print("  Phase 2B SUCCESS - Ready for advanced modeling")
            print("  Recommend: Proceed to Phase 3 with full preprocessing")
        elif best_r2 >= 0.65:
            print("  Phase 2B PARTIAL - Good foundation, needs enhancement")
            print("  Recommend: Implement full feature engineering pipeline")
        else:
            print("  Phase 2B REVIEW - Need to investigate further")
            print("  Recommend: Review feature selection and data quality")
        
        return {
            'best_model': best_model,
            'best_r2': best_r2,
            'eda_validated': best_r2 >= 0.75,
            'feature_matches': matches
        }

def run_baseline_validation():
    """Execute the whole Phase 2B pipeline from raw CSV to validation report.

    Returns:
        (validator, results, feature_importance, report): the validator with
        its fitted models, the per-model metrics, the random-forest importance
        table, and the summary dict from the final report.
    """
    validator = BaselineModelValidator()

    # Steps 1-3: load, tag regions, choose EDA-driven features
    data = validator.load_and_prepare_data()
    data = validator.create_regional_mapping(data)
    data, features = validator.select_eda_features(data)

    # Steps 4-5: impute, then hold out the most recent years for testing
    X, y = validator.prepare_modeling_data(data, features)
    X_train, X_test, y_train, y_test = validator.create_temporal_split(data, X, y)

    # Steps 6-7: fit the baselines and score them on both partitions
    X_train_scaled = validator.train_baseline_models(X_train, y_train)
    results = validator.evaluate_models(X_train, X_test, y_train, y_test, X_train_scaled)

    # Steps 8-10: check importances and performance against EDA expectations
    feature_importance = validator.validate_feature_importance(X_train, features)
    best_model, best_r2 = validator.compare_to_eda_predictions(results)
    report = validator.generate_validation_report(
        results, feature_importance, best_model, best_r2
    )

    return validator, results, feature_importance, report

if __name__ == "__main__":
    # Entry point: run the full pipeline and keep its outputs at module level
    (validator,
     results,
     feature_importance,
     validation_report) = run_baseline_validation()

    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 2B BASELINE VALIDATION COMPLETED")
    print(banner)
    print("Ready for Phase 3: Advanced Model Development")
    print(banner)

PHASE 2B: BASELINE MODEL VALIDATION
Dataset loaded: 2938 records, 22 features
After removing missing targets: 2928 records

FEATURE SELECTION BASED ON EDA INSIGHTS
----------------------------------------
Tier 1 features (EDA top predictors):
  Schooling: correlation=0.752, missing=5.5%
  Adult Mortality: correlation=-0.696, missing=0.0%
  HIV/AIDS: correlation=-0.557, missing=0.0%

Tier 2 features (EDA strong predictors):
  Income composition of resources: correlation=0.725, missing=5.5%
  BMI: correlation=0.568, missing=1.1%
  GDP: correlation=0.461, missing=15.1%

Total features selected: 12

DATA PREPARATION FOR MODELING
----------------------------------------
Final modeling dataset: 2928 samples, 12 features
Target variable range: 36.3 - 89.0 years

TEMPORAL TRAIN/TEST SPLIT
----------------------------------------
Training set: 2379 samples (years 2000-2012)
Test set: 549 samples (years 2013-2015)
Training target range: 36.3 - 89.0 years
Test target range: 48.1 - 89.0 years

TRA