# California Housing - Data Processing & Feature Engineering

This notebook implements comprehensive data preprocessing and feature engineering for the California housing dataset, preparing the data for machine learning model training.

## Objectives
1. Load and clean the California housing dataset
2. Handle any missing values and data quality issues
3. Engineer meaningful features specific to housing data
4. Create geographic, density, and interaction features
5. Scale and encode features appropriately
6. Split data into training and validation sets
7. Save processed data for model training

## 1. Setup and Data Loading

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings

# Custom modules
import sys
sys.path.append('..')

from src.data_loader import load_data_with_fallback, validate_dataset
from src.data_pipeline import CaliforniaHousingPipeline
from src.data_cleaning import DataCleaner
from src.feature_engineering import CaliforniaHousingFeatureEngineer
from src.missing_value_handler import MissingValueHandler

# Notebook-wide presentation settings.
# Show every column when a wide frame is displayed.
pd.options.display.max_columns = None

# The palette is applied AFTER the style sheet on purpose: the style sheet
# sets its own colour cycle, which the palette call then replaces.
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/matplotlib.
warnings.simplefilter('ignore')

print("🏠 California Housing Data Processing Environment Setup Complete")

In [None]:
# Fetch the raw train/test frames through the project loader.
print("📊 Loading California housing dataset...")
train_data, test_data = load_data_with_fallback()

# Fail fast: nothing below is meaningful unless the loader output validates.
is_valid = validate_dataset(train_data, test_data)
if not is_valid:
    print("❌ Dataset validation failed!")
    raise ValueError("Cannot proceed with invalid dataset")

print("\n✅ California housing dataset loaded successfully!")
print(f"📈 Training data: {train_data.shape}")
print(f"📊 Test data: {test_data.shape}")
# One column is subtracted from the width to exclude the target.
print(f"🎯 Features: {train_data.shape[1] - 1}")

# Numbered listing of every column with its dtype.
print("\n📋 Dataset Columns:")
for position, (column_name, column_dtype) in enumerate(train_data.dtypes.items(), start=1):
    print(f"  {position:2d}. {column_name:<25} ({column_dtype})")

## 2. Initial Data Overview

In [None]:
# Headline statistics for the raw training frame.
print("📊 CALIFORNIA HOUSING DATASET OVERVIEW")
print("=" * 60)

# Size, memory footprint and total missing cells.
row_count, column_count = train_data.shape
memory_mb = train_data.memory_usage(deep=True).sum() / 1024**2
missing_cells = train_data.isnull().sum().sum()

print("\nDataset Information:")
print(f"  • Total samples: {row_count:,}")
print(f"  • Total features: {column_count - 1}")
print(f"  • Memory usage: {memory_mb:.2f} MB")
print(f"  • Missing values: {missing_cells}")

# Summary of the target column, when it is present in the frame.
if 'median_house_value' in train_data.columns:
    target = train_data['median_house_value']
    target_low, target_high = target.min(), target.max()
    print("\nTarget Variable (median_house_value):")
    print(f"  • Mean: ${target.mean():,.0f}")
    print(f"  • Median: ${target.median():,.0f}")
    print(f"  • Range: ${target_low:,.0f} - ${target_high:,.0f}")
    print(f"  • Standard deviation: ${target.std():,.0f}")

# Column counts per broad dtype family.
numeric_columns = train_data.select_dtypes(include=[np.number]).columns
categorical_columns = train_data.select_dtypes(include=['object', 'string']).columns
print("\nFeature Types:")
print(f"  • Numerical: {len(numeric_columns)}")
print(f"  • Categorical: {len(categorical_columns)}")

In [None]:
# 2x3 grid: the first two panels show the target (raw and log scale), the
# remaining four show the key input features.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
panels = axes.ravel()  # row-major order: panels[0..2] top row, panels[3..5] bottom row

if 'median_house_value' in train_data.columns:
    target = train_data['median_house_value']

    # Raw target distribution
    panels[0].hist(target, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    panels[0].set(title='House Value Distribution',
                  xlabel='Median House Value ($)',
                  ylabel='Frequency')

    # Same data on a log1p scale
    panels[1].hist(np.log1p(target), bins=50, alpha=0.7, color='lightgreen', edgecolor='black')
    panels[1].set(title='Log(House Value) Distribution',
                  xlabel='Log(Median House Value)',
                  ylabel='Frequency')

# Only plot the key features that actually exist in the frame.
key_features = ['median_income', 'total_rooms', 'housing_median_age', 'population']
existing_features = [name for name in key_features if name in train_data.columns]

# Panels 2..5 are reserved for features; zip stops at whichever runs out first.
for feature, panel in zip(existing_features, panels[2:]):
    train_data[feature].hist(bins=30, ax=panel, alpha=0.7, color='coral')
    panel.set(title=f'{feature} Distribution', xlabel=feature, ylabel='Frequency')

plt.tight_layout()
plt.show()

## 3. Complete Preprocessing Pipeline

In [None]:
# Build the pipeline on the raw frames, echo its configuration, then run it.
print("🚀 Initializing California Housing Preprocessing Pipeline...")
pipeline = CaliforniaHousingPipeline(train_data, test_data)

# One line per configuration entry.
print("\n⚙️ Pipeline Configuration:")
for option_name, option_value in pipeline.pipeline_config.items():
    print(f"  • {option_name}: {option_value}")

# save_processed=True is passed so the run also writes its outputs;
# `results` is read by the plotting cells further down.
print("\n🔄 Running complete preprocessing pipeline...")
results = pipeline.run_pipeline(save_processed=True)

## 4. Feature Engineering Results Analysis

In [None]:
# Report what the pipeline's feature engineer produced.
if not pipeline.feature_engineer:
    print("⚠️ Feature engineering not completed")
else:
    feature_summary = pipeline.feature_engineer.get_feature_summary()
    pipeline.feature_engineer.print_feature_summary()

    print("\n🔍 Detailed Feature Analysis:")
    print(f"  • Original California housing features: {feature_summary['original_features']}")
    print(f"  • Engineered features: {feature_summary['total_created']}")
    print(f"  • Total final features: {feature_summary['final_feature_count']}")

    # List the created features grouped by category, skipping empty groups.
    categories = feature_summary['feature_categories']
    for category, features in categories.items():
        if not features:
            continue
        print(f"\n  📊 {category.title()} Features ({len(features)}):")
        for feature in features:
            print(f"    - {feature}")

In [None]:
# Four-panel summary of what the preprocessing pipeline did:
#   [0,0] feature count before/after   [0,1] sample counts per split
#   [1,0] target before/after          [1,1] strongest correlations with target
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Each panel checks every `results` key it reads, so a partial results dict
# leaves that panel blank instead of raising KeyError halfway through drawing.
feature_count_keys = ('original_train_shape', 'X_train_shape', 'feature_count')
split_size_keys = ('original_train_shape', 'X_train_shape', 'X_val_shape')

# Original vs processed feature count
if all(key in results for key in feature_count_keys):
    # Distinct label names are used here so the `categories` dict created by
    # the feature-summary cell is not overwritten with a list.
    count_labels = ['Original Features', 'Processed Features']
    # The original width includes the target column, hence the -1.
    feature_counts = [results['original_train_shape'][1] - 1, results['feature_count']]

    axes[0,0].bar(count_labels, feature_counts, color=['lightcoral', 'lightblue'])
    axes[0,0].set_title('Feature Count: Before vs After Engineering')
    axes[0,0].set_ylabel('Number of Features')

    # Add value labels on bars
    for i, v in enumerate(feature_counts):
        axes[0,0].text(i, v + 0.5, str(v), ha='center', va='bottom', fontweight='bold')

# Sample size changes
if all(key in results for key in split_size_keys):
    split_labels = ['Original Training', 'Final Training', 'Validation']
    sample_counts = [
        results['original_train_shape'][0],
        results['X_train_shape'][0],
        results['X_val_shape'][0]
    ]

    axes[0,1].bar(split_labels, sample_counts, color=['lightcoral', 'lightblue', 'lightgreen'])
    axes[0,1].set_title('Sample Count: Original vs Final Splits')
    axes[0,1].set_ylabel('Number of Samples')
    axes[0,1].tick_params(axis='x', rotation=45)

# Target distribution before and after processing
if pipeline.y_train is not None:
    # Original target (only when the raw frame still carries the column)
    original_target = train_data['median_house_value'] if 'median_house_value' in train_data.columns else None
    if original_target is not None:
        axes[1,0].hist(original_target, bins=30, alpha=0.7, color='skyblue', label='Original')

    # Processed target
    axes[1,0].hist(pipeline.y_train, bins=30, alpha=0.7, color='lightcoral', label='Processed')
    axes[1,0].set_title('Target Distribution: Original vs Processed')
    axes[1,0].set_xlabel('House Value ($)')
    axes[1,0].set_ylabel('Frequency')
    axes[1,0].legend()

# Feature correlation with target (top 10 by absolute value)
if pipeline.X_train is not None and pipeline.y_train is not None:
    combined_data = pipeline.X_train.copy()
    combined_data[pipeline.target_col] = pipeline.y_train

    correlations = combined_data.corr()[pipeline.target_col].drop(pipeline.target_col)

    # Zero-variance columns yield NaN correlations. sort_values() places NaN
    # last, so without dropna() the tail(10) below would select exactly those
    # undefined entries instead of the strongest features.
    top_corr = correlations.dropna().abs().sort_values(ascending=True).tail(10)

    # Bar length is |r|; colour carries the sign (red = negative, green = positive).
    colors = ['red' if correlations[feature] < 0 else 'green' for feature in top_corr.index]

    # pandas refuses to plot an empty series, so skip the bars in that case.
    if not top_corr.empty:
        top_corr.plot(kind='barh', ax=axes[1,1], color=colors)
    axes[1,1].set_title('Top 10 Features Correlated with House Value')
    axes[1,1].set_xlabel('Absolute Correlation')

plt.tight_layout()
plt.show()

## 5. Feature Engineering Deep Dive

In [None]:
# Re-run three feature-creation steps one at a time and report which columns
# each step adds.
if pipeline.feature_engineer:
    feature_engineer = pipeline.feature_engineer

    print("🔍 FEATURE ENGINEERING ANALYSIS")
    print("=" * 60)

    # This cell temporarily repoints the engineer's train/test frames to chain
    # the steps. Remember whatever it held so it can be put back afterwards;
    # otherwise the pipeline's shared engineer is left holding partial data and
    # re-running this cell gives different output.
    saved_state = {
        attr: getattr(feature_engineer, attr)
        for attr in ('train_data', 'test_data')
        if hasattr(feature_engineer, attr)
    }

    try:
        # Column names are snapshotted BEFORE each call. Comparing against a
        # frame after the call reports nothing if the method adds its columns
        # in place and returns the same object.
        print("\n🗺️ Testing Geographic Features...")
        columns_before = set(train_data.columns)
        temp_train, temp_test = feature_engineer.create_geographic_features()
        geo_features = [col for col in temp_train.columns if col not in columns_before]
        print(f"Created geographic features: {geo_features}")

        print("\n🏘️ Testing Density Features...")
        feature_engineer.train_data = temp_train
        feature_engineer.test_data = temp_test
        columns_before = set(temp_train.columns)
        temp_train, temp_test = feature_engineer.create_housing_density_features()
        density_features = [col for col in temp_train.columns if col not in columns_before]
        print(f"Created density features: {density_features}")

        print("\n💰 Testing Income Features...")
        feature_engineer.train_data = temp_train
        feature_engineer.test_data = temp_test
        columns_before = set(temp_train.columns)
        temp_train, temp_test = feature_engineer.create_income_features()
        income_features = [col for col in temp_train.columns if col not in columns_before]
        print(f"Created income features: {income_features}")
    finally:
        # Restore the engineer's frames even if one of the steps raised.
        for attr, value in saved_state.items():
            setattr(feature_engineer, attr, value)

In [None]:
# Histograms for a few ratio features, followed by a listing of the
# geography-related columns found in the processed training frame.
if pipeline.X_train is not None:
    engineered_features = ['rooms_per_household', 'population_per_household', 'bedrooms_per_room']
    existing_eng_features = [name for name in engineered_features if name in pipeline.X_train.columns]

    if existing_eng_features:
        n_features = len(existing_eng_features)
        # squeeze=False always returns a 2-D array, so the single-panel case
        # needs no special handling; row 0 is the only row.
        fig, axes_grid = plt.subplots(1, min(3, n_features), figsize=(15, 5), squeeze=False)
        axes = axes_grid[0]

        # At most three panels exist; zip stops when either side is exhausted.
        for feature, panel in zip(existing_eng_features, axes):
            pipeline.X_train[feature].hist(bins=30, ax=panel, alpha=0.7, color='lightgreen')
            panel.set_title(f'{feature} Distribution')
            panel.set_xlabel(feature.replace('_', ' ').title())
            panel.set_ylabel('Frequency')

        plt.suptitle('Engineered Feature Distributions')
        plt.tight_layout()
        plt.show()

    # A column counts as geographic when its name contains 'distance' or it is
    # one of the two named region flags.
    region_flags = ['is_northern_ca', 'is_coastal']
    geo_features = [
        name for name in pipeline.X_train.columns
        if 'distance' in name or name in region_flags
    ]

    if geo_features:
        print(f"\n🗺️ Geographic Features Created: {len(geo_features)}")
        for feature in geo_features:
            print(f"  • {feature}")

## 6. Data Quality After Processing

In [None]:
# Run the project's quality assessor over the processed training data.
if pipeline.X_train is None:
    print("⚠️ Processed data not available for quality assessment")
else:
    from src.data_quality import DataQualityAssessor

    # The assessor receives features and target together in a single frame;
    # work on a copy so the pipeline's own X_train is left untouched.
    processed_data = pipeline.X_train.copy()
    processed_data[pipeline.target_col] = pipeline.y_train

    quality_assessor = DataQualityAssessor(
        processed_data,
        "Processed California Housing",
    )
    quality_report = quality_assessor.generate_quality_report()

    quality_assessor.print_summary()

## 7. Feature Correlation Analysis

In [None]:
# Correlation of every processed feature with the target: printed rankings
# followed by a signed bar chart of the strongest relationships.
if pipeline.X_train is not None and pipeline.y_train is not None:
    print("🔗 FEATURE CORRELATION ANALYSIS")
    print("=" * 60)

    # Attach the target to a copy of the features so one corr() call covers both.
    correlation_data = pipeline.X_train.copy()
    correlation_data[pipeline.target_col] = pipeline.y_train

    # Correlation of each feature with the target (the target's own row removed).
    target_correlations = correlation_data.corr()[pipeline.target_col].drop(pipeline.target_col)

    # Top positive and negative correlations
    top_positive = target_correlations.sort_values(ascending=False).head(10)
    top_negative = target_correlations.sort_values(ascending=True).head(5)

    print(f"\n📈 Top 10 Positive Correlations:")
    for feature, corr in top_positive.items():
        print(f"  • {feature:<30}: {corr:+.3f}")

    # Only negative correlations below -0.1 are reported. Filtering first
    # (instead of peeking at .iloc[0]) also copes with an empty series.
    meaningful_negative = top_negative[top_negative < -0.1]
    if not meaningful_negative.empty:
        print(f"\n📉 Top Negative Correlations:")
        for feature, corr in meaningful_negative.items():
            print(f"  • {feature:<30}: {corr:+.3f}")

    # Chart the strongest correlations by absolute value, keeping the sign.
    # Previously this was built from `top_positive`, which holds at most ten
    # rows and no deliberately selected negatives, so the "Top 15" title and
    # the red/green colouring did not match what was drawn.
    defined_correlations = target_correlations.dropna()  # NaN = zero-variance column
    strongest = defined_correlations.abs().nlargest(15).index
    # Ascending signed order: most negative at the bottom, most positive on top.
    top_features = defined_correlations[strongest].sort_values()

    if top_features.empty:
        print("\n⚠️ No defined feature correlations to plot")
    else:
        colors = ['green' if x > 0 else 'red' for x in top_features.values]

        fig, ax = plt.subplots(figsize=(12, 8))
        top_features.plot(kind='barh', ax=ax, color=colors)
        ax.set_title(f'Top {len(top_features)} Features Correlated with House Value')
        ax.set_xlabel('Correlation Coefficient')
        ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)
        plt.tight_layout()
        plt.show()

## 8. Train-Validation Split Analysis

In [None]:
# Check that the train/validation split is balanced: sizes, target summary
# statistics, overlaid distributions, and a two-sample KS test.
if all(x is not None for x in [pipeline.X_train, pipeline.X_val, pipeline.y_train, pipeline.y_val]):
    print("📊 TRAIN-VALIDATION SPLIT ANALYSIS")
    print("=" * 60)

    # Compute the sizes once instead of re-deriving the total in each print.
    n_train = len(pipeline.X_train)
    n_val = len(pipeline.X_val)
    n_total = n_train + n_val

    print(f"\nSplit Summary:")
    print(f"  • Training samples: {n_train:,} ({n_train / n_total * 100:.1f}%)")
    print(f"  • Validation samples: {n_val:,} ({n_val / n_total * 100:.1f}%)")
    print(f"  • Features: {pipeline.X_train.shape[1]}")

    # Compare target distributions
    print(f"\nTarget Variable Comparison:")
    print(f"  Training - Mean: ${pipeline.y_train.mean():,.0f}, Std: ${pipeline.y_train.std():,.0f}")
    print(f"  Validation - Mean: ${pipeline.y_val.mean():,.0f}, Std: ${pipeline.y_val.std():,.0f}")

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Overlay both splits on the SAME bin edges and as densities. With
    # per-split bins and raw counts the two splits are drawn on different
    # scales, so their shapes cannot be compared by eye.
    shared_bins = np.histogram_bin_edges(
        np.concatenate([pipeline.y_train.values, pipeline.y_val.values]), bins=30
    )
    axes[0].hist(pipeline.y_train, bins=shared_bins, density=True, alpha=0.7,
                 color='lightblue', label='Training')
    axes[0].hist(pipeline.y_val, bins=shared_bins, density=True, alpha=0.7,
                 color='lightcoral', label='Validation')
    axes[0].set_title('Target Distribution: Training vs Validation')
    axes[0].set_xlabel('House Value ($)')
    axes[0].set_ylabel('Density')
    axes[0].legend()

    # Box plot comparison
    data_for_box = [
        pipeline.y_train.values,
        pipeline.y_val.values
    ]

    # Tick labels are set separately: the `labels=` keyword of boxplot is
    # deprecated in recent Matplotlib releases, while set_xticklabels works on
    # both old and new versions.
    axes[1].boxplot(data_for_box)
    axes[1].set_xticklabels(['Training', 'Validation'])
    axes[1].set_title('Target Distribution Box Plot')
    axes[1].set_ylabel('House Value ($)')

    plt.tight_layout()
    plt.show()

    # Statistical test for distribution similarity
    from scipy import stats

    # Two-sample Kolmogorov-Smirnov test. A p-value above 0.05 means the test
    # found no evidence that the two splits differ.
    ks_statistic, ks_p_value = stats.ks_2samp(pipeline.y_train, pipeline.y_val)

    print(f"\nDistribution Similarity Test:")
    print(f"  • KS statistic: {ks_statistic:.4f}")
    print(f"  • P-value: {ks_p_value:.4f}")

    if ks_p_value > 0.05:
        print(f"  ✅ Distributions are similar (good split)")
    else:
        print(f"  ⚠️ Distributions may be different")
else:
    print("⚠️ Split data not available for analysis")

## 9. Final Data Preparation Summary

In [None]:
# Print the pipeline's own processing summary, then a short status table
# built from its info dictionary.
processing_summary = pipeline.get_processing_summary()
print(processing_summary)

pipeline_info = pipeline.get_pipeline_info()

banner = "=" * 60
print("\n🎯 PIPELINE STATUS SUMMARY")
print(banner)

# Render the fitted flag as a tick or cross.
fitted_label = '✅ Yes' if pipeline_info['is_fitted'] else '❌ No'
print(f"Pipeline fitted: {fitted_label}")
print(f"Target column: {pipeline_info['target_column']}")
print(f"Final feature count: {pipeline_info['feature_count']}")

# Sample counts use thousands separators.
training_samples = pipeline_info['training_samples']
print(f"Training samples: {training_samples:,}")
validation_samples = pipeline_info['validation_samples']
print(f"Validation samples: {validation_samples:,}")

print(f"Created features: {pipeline_info['created_features']}")
print(f"Processing steps completed: {pipeline_info['processing_steps']}")

## 10. Data Export and Preparation for Modeling

In [None]:
# Verify that the processed data was written to disk and can be read back
# before declaring the notebook's output ready for model training.
from pathlib import Path
from config.settings import PROCESSED_DATA_DIR

print("💾 PROCESSED DATA VERIFICATION")
print("=" * 60)

# Becomes True only after the processed datasets were reloaded successfully.
# The closing message depends on it, so a missing directory or a failed load
# is no longer followed by an unconditional "ready" announcement.
data_verified = False

# Check what files were saved
if PROCESSED_DATA_DIR.exists():
    # glob() order depends on the filesystem; sort each group so the listing
    # is identical from run to run (CSV files first, then pickles).
    saved_files = sorted(PROCESSED_DATA_DIR.glob('*.csv')) + sorted(PROCESSED_DATA_DIR.glob('*.pkl'))

    print(f"\n📁 Files saved in {PROCESSED_DATA_DIR}:")
    for file_path in saved_files:
        file_size = file_path.stat().st_size / 1024  # Size in KB
        print(f"  • {file_path.name:<25} ({file_size:.1f} KB)")

    # Test loading processed data
    print(f"\n🧪 Testing data loading...")
    try:
        loaded_datasets = pipeline.load_processed_data()

        print(f"  ✅ Successfully loaded:")
        for dataset_name, dataset in loaded_datasets.items():
            if isinstance(dataset, pd.DataFrame):
                print(f"    - {dataset_name}: {dataset.shape}")
            elif isinstance(dataset, pd.Series):
                print(f"    - {dataset_name}: ({len(dataset)},)")
            else:
                print(f"    - {dataset_name}: {type(dataset).__name__}")

        # An empty result means nothing was actually read back.
        data_verified = bool(loaded_datasets)

    except Exception as e:
        # Deliberately broad: this is a best-effort check, so the failure is
        # reported and the cell continues; data_verified stays False.
        print(f"  ❌ Error loading data: {e}")

else:
    print(f"⚠️ Processed data directory not found: {PROCESSED_DATA_DIR}")

if data_verified:
    print(f"\n✅ Data preprocessing completed and ready for Phase 4: Model Training!")
    print(f"🎯 Next: Train multiple ML models on the processed California housing data")
else:
    print(f"\n⚠️ Processed data could not be verified - resolve the issues above before Phase 4: Model Training")

## 11. Key Insights for Modeling

In [None]:
# Summarise what the processed data suggests for the model-training phase:
# strongly correlated features and the shape of the target distribution.
print("💡 KEY INSIGHTS FOR MODEL TRAINING")
print("=" * 60)

if pipeline.X_train is not None and pipeline.y_train is not None:
    # Correlation of every feature with the target (the target's own row removed).
    correlation_data = pipeline.X_train.copy()
    correlation_data[pipeline.target_col] = pipeline.y_train
    target_corr = correlation_data.corr()[pipeline.target_col].drop(pipeline.target_col)

    # "Strong" means an absolute correlation above 0.3, in either direction.
    strong_features = target_corr[target_corr.abs() > 0.3]

    # Compute the skewness once; it drives both the label and the
    # transformation hint below.
    target_skewness = pipeline.y_train.skew()
    if target_skewness < -0.5:
        skew_label = 'left-skewed'
    elif target_skewness > 0.5:
        skew_label = 'right-skewed'
    else:
        # Low skewness only indicates symmetry; it says nothing about the
        # distribution being normal, so the label does not claim normality.
        skew_label = 'approximately symmetric'

    print(f"\n🎯 Modeling Recommendations:")

    if len(strong_features) > 0:
        print(f"  • {len(strong_features)} features with strong correlation (>0.3):")
        # Ordered by strength, printed with the original sign.
        for feature in strong_features.abs().sort_values(ascending=False).index:
            print(f"    - {feature}: {target_corr[feature]:+.3f}")

    print(f"\n  • Total features available: {pipeline.X_train.shape[1]}")
    print(f"  • Data is clean and ready for training")
    print(f"  • Recommended models: Linear Regression, Random Forest, XGBoost")
    print(f"  • Target is {skew_label}")

    if abs(target_skewness) > 1:
        print(f"  • Consider log transformation of target (skewness: {target_skewness:.2f})")

    print(f"\n🚀 Ready to proceed to Phase 4: Model Development & Training!")

else:
    print("⚠️ Pipeline not completed - cannot provide modeling insights")