## WHO Life Expectancy Dataset - Phase 2A: Smart Data Preprocessing
### - Based on comprehensive EDA insights

In [2]:
# WHO Life Expectancy Dataset - Phase 2A: Smart Data Preprocessing
# Based on comprehensive EDA insights

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
import warnings
# Suppress every warning (no category/module filter is given) for the rest of
# the session.  NOTE(review): this also hides pandas/scikit-learn deprecation
# warnings raised by the pipeline below - confirm that is intended.
warnings.filterwarnings('ignore')

# Set up visualization parameters: global matplotlib style sheet and the
# default seaborn colour palette used by any plot created afterwards.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

class LifeExpectancyPreprocessor:
    """
    Comprehensive preprocessing pipeline for WHO Life Expectancy data
    Based on EDA insights from comprehensive analysis
    """
    
    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()
        self.regional_mapping = {}
        self.preprocessing_report = {}
        
    def load_and_setup_data(self):
        """Load data and perform initial setup"""
        print("=" * 80)
        print("PHASE 2A: SMART DATA PREPROCESSING")
        print("=" * 80)
        
        # Load the data
        df = pd.read_csv('../Life Expectancy Data.csv')
        df.columns = df.columns.str.strip()
        
        print(f"📊 Initial Dataset: {df.shape[0]} records, {df.shape[1]} features")
        print(f"📅 Time period: {df['Year'].min()}-{df['Year'].max()}")
        print(f"🌍 Countries: {df['Country'].nunique()}")
        
        return df
    
    def analyze_missing_data_patterns(self, df):
        """Analyze missing data patterns based on EDA insights"""
        print(f"\n🔍 MISSING DATA ANALYSIS")
        print("-" * 50)
        
        missing_summary = pd.DataFrame({
            'Column': df.columns,
            'Missing_Count': df.isnull().sum(),
            'Missing_Percentage': (df.isnull().sum() / len(df)) * 100,
            'Data_Type': df.dtypes
        }).sort_values('Missing_Percentage', ascending=False)
        
        high_missing = missing_summary[missing_summary['Missing_Percentage'] > 15]
        medium_missing = missing_summary[(missing_summary['Missing_Percentage'] >= 5) & 
                                       (missing_summary['Missing_Percentage'] <= 15)]
        
        print("HIGH PRIORITY (>15% missing):")
        for _, row in high_missing.iterrows():
            if row['Missing_Count'] > 0:
                print(f"  📊 {row['Column']}: {row['Missing_Count']} ({row['Missing_Percentage']:.1f}%)")
        
        print("\nMEDIUM PRIORITY (5-15% missing):")
        for _, row in medium_missing.iterrows():
            if row['Missing_Count'] > 0:
                print(f"  📈 {row['Column']}: {row['Missing_Count']} ({row['Missing_Percentage']:.1f}%)")
        
        return missing_summary
    
    def create_regional_mapping(self, df):
        """Create regional mapping based on EDA insights"""
        print(f"\n🌍 REGIONAL MAPPING CREATION")
        print("-" * 50)
        
        def assign_region(country):
            africa_keywords = ['Algeria', 'Angola', 'Botswana', 'Burkina', 'Burundi', 'Cameroon', 'Chad', 'Congo', 
                              'Ethiopia', 'Ghana', 'Kenya', 'Madagascar', 'Malawi', 'Mali', 'Morocco', 'Niger', 
                              'Nigeria', 'Rwanda', 'Senegal', 'Sierra Leone', 'Somalia', 'South Africa', 'Sudan', 
                              'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe']
            europe_keywords = ['Albania', 'Austria', 'Belgium', 'Croatia', 'Cyprus', 'Czech', 'Denmark', 'Estonia', 
                              'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 
                              'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Norway', 'Poland', 
                              'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 
                              'United Kingdom']
            asia_keywords = ['Afghanistan', 'Bangladesh', 'Bhutan', 'Cambodia', 'China', 'India', 'Indonesia', 
                            'Iran', 'Iraq', 'Japan', 'Jordan', 'Kazakhstan', 'Kuwait', 'Kyrgyzstan', 'Laos', 
                            'Lebanon', 'Malaysia', 'Maldives', 'Mongolia', 'Myanmar', 'Nepal', 'Oman', 'Pakistan', 
                            'Philippines', 'Qatar', 'Saudi Arabia', 'Singapore', 'Sri Lanka', 'Syria', 'Tajikistan', 
                            'Thailand', 'Turkey', 'Turkmenistan', 'United Arab Emirates', 'Uzbekistan', 'Viet Nam', 'Yemen']
            americas_keywords = ['Argentina', 'Bolivia', 'Brazil', 'Canada', 'Chile', 'Colombia', 'Costa Rica', 
                               'Cuba', 'Dominican Republic', 'Ecuador', 'El Salvador', 'Guatemala', 'Haiti', 
                               'Honduras', 'Jamaica', 'Mexico', 'Nicaragua', 'Panama', 'Paraguay', 'Peru', 
                               'Trinidad and Tobago', 'United States', 'Uruguay', 'Venezuela']
            
            if any(keyword in country for keyword in africa_keywords):
                return 'Africa'
            elif any(keyword in country for keyword in europe_keywords):
                return 'Europe'
            elif any(keyword in country for keyword in asia_keywords):
                return 'Asia'
            elif any(keyword in country for keyword in americas_keywords):
                return 'Americas'
            else:
                return 'Other/Oceania'
        
        df['Region'] = df['Country'].apply(assign_region)
        
        # Store regional mapping for later use
        self.regional_mapping = df.groupby('Region')['Country'].unique().to_dict()
        
        regional_stats = df.groupby('Region').agg({
            'Life expectancy': ['count', 'mean'],
            'Country': 'nunique'
        }).round(2)
        
        print("Regional Distribution:")
        for region in df['Region'].unique():
            count = df[df['Region'] == region]['Country'].nunique()
            avg_life_exp = df[df['Region'] == region]['Life expectancy'].mean()
            print(f"  🗺️ {region}: {count} countries, avg life expectancy: {avg_life_exp:.1f} years")
        
        return df
    
    def implement_smart_missing_data_strategy(self, df):
        """Implement missing data strategy based on EDA insights"""
        print(f"\n🔧 SMART MISSING DATA IMPUTATION")
        print("-" * 50)
        
        df_imputed = df.copy()
        
        # High Priority Imputation (>15% missing)
        print("📊 HIGH PRIORITY IMPUTATION:")
        
        # 1. Population (22% missing) - Regional imputation
        if 'Population' in df_imputed.columns:
            print("  🏙️ Population: Regional median imputation")
            for region in df_imputed['Region'].unique():
                mask = (df_imputed['Region'] == region) & df_imputed['Population'].isnull()
                if mask.sum() > 0:
                    regional_median = df_imputed[df_imputed['Region'] == region]['Population'].median()
                    df_imputed.loc[mask, 'Population'] = regional_median
        
        # 2. Hepatitis B (19% missing) - Development status aware
        if 'Hepatitis B' in df_imputed.columns:
            print("  💉 Hepatitis B: Development status-aware imputation")
            for status in df_imputed['Status'].unique():
                mask = (df_imputed['Status'] == status) & df_imputed['Hepatitis B'].isnull()
                if mask.sum() > 0:
                    status_median = df_imputed[df_imputed['Status'] == status]['Hepatitis B'].median()
                    df_imputed.loc[mask, 'Hepatitis B'] = status_median
        
        # 3. GDP (15% missing) - Economic trend interpolation
        if 'GDP' in df_imputed.columns:
            print("  💰 GDP: Regional economic trend interpolation")
            for region in df_imputed['Region'].unique():
                region_data = df_imputed[df_imputed['Region'] == region].copy()
                if region_data['GDP'].isnull().sum() > 0:
                    # Use KNN imputation within region
                    region_features = ['Year', 'percentage expenditure', 'Total expenditure']
                    available_features = [f for f in region_features if f in region_data.columns]
                    
                    if len(available_features) > 1:
                        imputer = KNNImputer(n_neighbors=3)
                        region_data_subset = region_data[available_features + ['GDP']].copy()
                        imputed_values = imputer.fit_transform(region_data_subset)
                        df_imputed.loc[df_imputed['Region'] == region, 'GDP'] = imputed_values[:, -1]
        
        # Medium Priority Imputation (5-15% missing)
        print("\n📈 MEDIUM PRIORITY IMPUTATION:")
        medium_priority_cols = ['Total expenditure', 'Alcohol', 'Income composition of resources', 'Schooling']
        
        for col in medium_priority_cols:
            if col in df_imputed.columns and df_imputed[col].isnull().sum() > 0:
                print(f"  📋 {col}: Forward/backward fill by country with regional fallback")
                # Forward fill by country, then regional median
                df_imputed[col] = df_imputed.groupby('Country')[col].fillna(method='ffill').fillna(method='bfill')
                
                # Regional median for remaining missing values
                for region in df_imputed['Region'].unique():
                    mask = (df_imputed['Region'] == region) & df_imputed[col].isnull()
                    if mask.sum() > 0:
                        regional_median = df_imputed[df_imputed['Region'] == region][col].median()
                        df_imputed.loc[mask, col] = regional_median
        
        # Low Priority Imputation (<5% missing)
        print("\n📉 LOW PRIORITY IMPUTATION:")
        low_priority_cols = ['BMI', 'thinness  1-19 years', 'thinness 5-9 years', 'Polio', 'Diphtheria']
        
        for col in low_priority_cols:
            if col in df_imputed.columns and df_imputed[col].isnull().sum() > 0:
                print(f"  📋 {col}: Global median imputation")
                df_imputed[col].fillna(df_imputed[col].median(), inplace=True)
        
        # Report imputation results
        print(f"\n✅ IMPUTATION SUMMARY:")
        remaining_missing = df_imputed.isnull().sum()
        remaining_missing = remaining_missing[remaining_missing > 0]
        
        if len(remaining_missing) > 0:
            print("Remaining missing values:")
            for col, count in remaining_missing.items():
                percentage = (count / len(df_imputed)) * 100
                print(f"  ⚠️ {col}: {count} ({percentage:.1f}%)")
        else:
            print("🎉 All missing values successfully imputed!")
        
        return df_imputed
    
    def create_feature_engineered_variables(self, df):
        """Create feature-engineered variables based on EDA insights"""
        print(f"\n🔬 FEATURE ENGINEERING")
        print("-" * 50)
        
        df_engineered = df.copy()
        
        # 1. Health Access Index (Adult Mortality + healthcare indicators)
        print("🏥 Creating Health Access Index...")
        health_indicators = ['Adult Mortality', 'HIV/AIDS']
        available_health = [col for col in health_indicators if col in df_engineered.columns]
        
        if len(available_health) >= 2:
            # Normalize and create inverse index (lower is better for mortality/disease)
            health_normalized = df_engineered[available_health].copy()
            for col in available_health:
                health_normalized[col] = 1 - (health_normalized[col] - health_normalized[col].min()) / (health_normalized[col].max() - health_normalized[col].min())
            df_engineered['Health_Access_Index'] = health_normalized.mean(axis=1)
            print(f"  ✅ Health Access Index created (range: {df_engineered['Health_Access_Index'].min():.2f}-{df_engineered['Health_Access_Index'].max():.2f})")
        
        # 2. Education-Economy Index (Schooling + Income Composition)
        print("🎓 Creating Education-Economy Index...")
        edu_econ_indicators = ['Schooling', 'Income composition of resources']
        available_edu_econ = [col for col in edu_econ_indicators if col in df_engineered.columns]
        
        if len(available_edu_econ) >= 2:
            # Normalize and create composite index
            edu_econ_normalized = df_engineered[available_edu_econ].copy()
            for col in available_edu_econ:
                edu_econ_normalized[col] = (edu_econ_normalized[col] - edu_econ_normalized[col].min()) / (edu_econ_normalized[col].max() - edu_econ_normalized[col].min())
            df_engineered['Education_Economy_Index'] = edu_econ_normalized.mean(axis=1)
            print(f"  ✅ Education-Economy Index created (range: {df_engineered['Education_Economy_Index'].min():.2f}-{df_engineered['Education_Economy_Index'].max():.2f})")
        
        # 3. Vaccination Coverage Index
        print("💉 Creating Vaccination Coverage Index...")
        vaccination_cols = ['Hepatitis B', 'Polio', 'Diphtheria']
        available_vaccines = [col for col in vaccination_cols if col in df_engineered.columns]
        
        if len(available_vaccines) >= 2:
            # Average vaccination coverage
            df_engineered['Vaccination_Coverage_Index'] = df_engineered[available_vaccines].mean(axis=1)
            print(f"  ✅ Vaccination Coverage Index created (range: {df_engineered['Vaccination_Coverage_Index'].min():.1f}-{df_engineered['Vaccination_Coverage_Index'].max():.1f})")
        
        # 4. Regional dummy variables (based on EDA insights)
        print("🗺️ Creating Regional dummy variables...")
        regional_dummies = pd.get_dummies(df_engineered['Region'], prefix='Region')
        df_engineered = pd.concat([df_engineered, regional_dummies], axis=1)
        print(f"  ✅ Regional dummies created: {list(regional_dummies.columns)}")
        
        # 5. Development Status encoding
        print("🏭 Encoding Development Status...")
        df_engineered['Status_Developed'] = (df_engineered['Status'] == 'Developed').astype(int)
        print(f"  ✅ Development Status encoded (Developed=1, Developing=0)")
        
        # 6. Time-based features
        print("📅 Creating Time-based features...")
        df_engineered['Years_Since_2000'] = df_engineered['Year'] - 2000
        df_engineered['Year_Squared'] = df_engineered['Years_Since_2000'] ** 2  # For non-linear time trends
        print(f"  ✅ Time features created: Years_Since_2000, Year_Squared")
        
        print(f"\n✅ FEATURE ENGINEERING COMPLETE")
        print(f"📊 New dataset shape: {df_engineered.shape}")
        
        return df_engineered
    
    def implement_feature_selection(self, df):
        """Implement feature selection based on EDA insights"""
        print(f"\n🎯 FEATURE SELECTION")
        print("-" * 50)
        
        # Based on EDA insights - feature importance ranking
        tier1_features = ['Schooling', 'Adult Mortality', 'HIV/AIDS']
        tier2_features = ['Income composition of resources', 'BMI', 'GDP']
        tier3_features = ['Hepatitis B', 'Polio', 'Diphtheria', 'percentage expenditure']
        
        # Engineered features
        engineered_features = ['Health_Access_Index', 'Education_Economy_Index', 'Vaccination_Coverage_Index']
        
        # Regional and categorical features
        regional_features = [col for col in df.columns if col.startswith('Region_')]
        categorical_features = ['Status_Developed', 'Years_Since_2000']
        
        # Features to remove based on EDA (high multicollinearity)
        remove_features = ['under-five deaths', 'infant deaths']  # Keep only infant deaths (0.997 correlation)
        
        # Combine selected features
        selected_features = []
        
        print("🥇 TIER 1 FEATURES (Highest Importance):")
        for feature in tier1_features:
            if feature in df.columns:
                selected_features.append(feature)
                correlation = df[feature].corr(df['Life expectancy']) if 'Life expectancy' in df.columns else 'N/A'
                print(f"  ✅ {feature} (correlation: {correlation:.3f})")
        
        print("\n🥈 TIER 2 FEATURES (Strong Predictors):")
        for feature in tier2_features:
            if feature in df.columns:
                selected_features.append(feature)
                correlation = df[feature].corr(df['Life expectancy']) if 'Life expectancy' in df.columns else 'N/A'
                print(f"  ✅ {feature} (correlation: {correlation:.3f})")
        
        print("\n🥉 TIER 3 FEATURES (Moderate Predictors):")
        for feature in tier3_features:
            if feature in df.columns:
                selected_features.append(feature)
                correlation = df[feature].corr(df['Life expectancy']) if 'Life expectancy' in df.columns else 'N/A'
                print(f"  ✅ {feature} (correlation: {correlation:.3f})")
        
        print("\n🔬 ENGINEERED FEATURES:")
        for feature in engineered_features:
            if feature in df.columns:
                selected_features.append(feature)
                correlation = df[feature].corr(df['Life expectancy']) if 'Life expectancy' in df.columns else 'N/A'
                print(f"  ✅ {feature} (correlation: {correlation:.3f})")
        
        print("\n🗺️ REGIONAL & CATEGORICAL FEATURES:")
        selected_features.extend(regional_features)
        selected_features.extend(categorical_features)
        for feature in regional_features + categorical_features:
            if feature in df.columns:
                print(f"  ✅ {feature}")
        
        print("\n❌ FEATURES REMOVED (High Multicollinearity):")
        for feature in remove_features:
            if feature in df.columns:
                print(f"  🗑️ {feature} (removed due to multicollinearity)")
        
        # Always include target variable and identifiers
        essential_features = ['Country', 'Year', 'Life expectancy', 'Status', 'Region']
        all_features = essential_features + selected_features
        
        # Remove duplicates and ensure features exist
        final_features = []
        for feature in all_features:
            if feature in df.columns and feature not in final_features:
                final_features.append(feature)
        
        # Create final dataset
        df_final = df[final_features].copy()
        
        print(f"\n✅ FEATURE SELECTION COMPLETE")
        print(f"📊 Original features: {df.shape[1]}")
        print(f"📊 Selected features: {len(final_features)}")
        print(f"📊 Final dataset shape: {df_final.shape}")
        
        return df_final, final_features
    
    def generate_preprocessing_report(self, original_df, final_df):
        """Generate comprehensive preprocessing report"""
        print(f"\n📋 PREPROCESSING REPORT")
        print("=" * 80)
        
        print(f"📊 DATA TRANSFORMATION SUMMARY:")
        print(f"  Original dataset: {original_df.shape}")
        print(f"  Final dataset: {final_df.shape}")
        print(f"  Records retained: {len(final_df):,} ({len(final_df)/len(original_df)*100:.1f}%)")
        
        print(f"\n🔍 MISSING DATA RESOLUTION:")
        original_missing = original_df.isnull().sum().sum()
        final_missing = final_df.isnull().sum().sum()
        print(f"  Original missing values: {original_missing:,}")
        print(f"  Final missing values: {final_missing:,}")
        print(f"  Missing data reduction: {((original_missing - final_missing) / original_missing * 100):.1f}%")
        
        print(f"\n🎯 FEATURE ENGINEERING:")
        new_features = [col for col in final_df.columns if col not in original_df.columns]
        print(f"  New features created: {len(new_features)}")
        for feature in new_features:
            print(f"    ✅ {feature}")
        
        print(f"\n🏆 READY FOR MODELING:")
        print(f"  ✅ Missing data handled strategically")
        print(f"  ✅ Features engineered based on EDA insights") 
        print(f"  ✅ Feature selection implemented")
        print(f"  ✅ Regional and categorical encoding complete")
        print(f"  ✅ Dataset optimized for regression modeling")
        
        return {
            'original_shape': original_df.shape,
            'final_shape': final_df.shape,
            'missing_reduction': ((original_missing - final_missing) / original_missing * 100),
            'new_features': new_features,
            'records_retained': len(final_df)/len(original_df)*100
        }

def run_preprocessing_pipeline():
    """Execute all preprocessing stages in order and persist the result.

    Returns:
        ``(df_final, selected_features, report)`` - the processed DataFrame,
        the list of retained column names and the summary dict produced by
        ``generate_preprocessing_report``.  The processed data is also written
        to ``Life_Expectancy_Processed.csv`` in the working directory.
    """
    pipeline = LifeExpectancyPreprocessor()

    # Stage 1: load the raw data.  The snapshot must be taken before the
    # regional mapping, which adds a column to the loaded frame.
    raw_df = pipeline.load_and_setup_data()
    untouched_copy = raw_df.copy()

    # Stage 2: print the missing-data overview (its return value is not needed here)
    pipeline.analyze_missing_data_patterns(raw_df)

    # Stages 3-5: regions -> imputation -> engineered features
    with_regions = pipeline.create_regional_mapping(raw_df)
    imputed = pipeline.implement_smart_missing_data_strategy(with_regions)
    engineered = pipeline.create_feature_engineered_variables(imputed)

    # Stage 6: keep only the selected columns
    selection = pipeline.implement_feature_selection(engineered)
    df_final, selected_features = selection

    # Stage 7: compare the result against the untouched input
    report = pipeline.generate_preprocessing_report(untouched_copy, df_final)

    # Persist the processed dataset
    df_final.to_csv('Life_Expectancy_Processed.csv', index=False)
    print(f"\n💾 Processed dataset saved as 'Life_Expectancy_Processed.csv'")

    return df_final, selected_features, report

if __name__ == "__main__":
    # Script entry point: run every stage and keep the results at module level
    processed_df, features, preprocessing_report = run_preprocessing_pipeline()

    # Closing banner, emitted as a single write (a leading blank line, then
    # the framed completion message and the pointer to the next phase)
    print("\n".join([
        "",
        "=" * 80,
        "🎉 PHASE 2A PREPROCESSING COMPLETED!",
        "=" * 80,
        "✅ Ready for Phase 2B: Quick Model Validation",
        "🚀 Next step: Build baseline models to test EDA predictions",
        "=" * 80,
    ]))

PHASE 2A: SMART DATA PREPROCESSING
📊 Initial Dataset: 2938 records, 22 features
📅 Time period: 2000-2015
🌍 Countries: 193

🔍 MISSING DATA ANALYSIS
--------------------------------------------------
HIGH PRIORITY (>15% missing):
  📊 Population: 652 (22.2%)
  📊 Hepatitis B: 553 (18.8%)
  📊 GDP: 448 (15.2%)

MEDIUM PRIORITY (5-15% missing):
  📈 Total expenditure: 226 (7.7%)
  📈 Alcohol: 194 (6.6%)
  📈 Income composition of resources: 167 (5.7%)
  📈 Schooling: 163 (5.5%)

🌍 REGIONAL MAPPING CREATION
--------------------------------------------------
Regional Distribution:
  🗺️ Asia: 36 countries, avg life expectancy: 70.5 years
  🗺️ Europe: 31 countries, avg life expectancy: 79.0 years
  🗺️ Africa: 31 countries, avg life expectancy: 57.3 years
  🗺️ Other/Oceania: 71 countries, avg life expectancy: 67.7 years
  🗺️ Americas: 24 countries, avg life expectancy: 74.0 years

🔧 SMART MISSING DATA IMPUTATION
--------------------------------------------------
📊 HIGH PRIORITY IMPUTATION:
  🏙️ Popula