# 02 - Feature Engineering

This notebook creates advanced features:
- Temporal features (cyclical encoding, seasons)
- Spatial features (coordinates, quadrants)
- Landsat-derived spectral indices (NDVI, NDWI, NBR, EVI, etc.)
- Climate-derived features
- Interaction features
- Aggregation features (group-level statistics by spatial quadrant)

The engineered dataset is saved for downstream modeling.

In [None]:
import sys
from pathlib import Path

sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_loading import (
    load_water_quality_data,
    load_landsat_data,
    load_terraclimate_data,
    merge_all_datasets,
    handle_missing_values,
    save_processed_data
)
from feature_engineering import (
    create_temporal_features,
    create_spatial_features,
    create_landsat_indices,
    create_climate_features,
    create_interaction_features,
    create_aggregation_features
)
from utils import reduce_mem_usage

import warnings
# Suppress every warning for the rest of the notebook (no category or module filter is given).
# NOTE(review): this also hides deprecation/dtype warnings raised inside the feature helpers —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

%matplotlib inline
# Apply seaborn's 'whitegrid' style to all figures drawn below.
sns.set_style('whitegrid')

## 1. Load and Merge Data

In [None]:
# Load all datasets
# All three loaders are called with no arguments, so file locations come from their defaults.
# The water-quality loader returns three values; the third is discarded here.
train_wq, test_wq, _ = load_water_quality_data()
# The Landsat and TerraClimate loaders each return a (train, test) pair.
train_landsat, test_landsat = load_landsat_data()
train_climate, test_climate = load_terraclimate_data()

In [None]:
# Combine the water-quality, Landsat and climate tables into one frame per split
train = merge_all_datasets(train_wq, train_landsat, train_climate)
test = merge_all_datasets(test_wq, test_landsat, test_climate)

# Report the merged shapes so unexpected row/column counts are visible immediately
for split_label, merged_frame in (("Training data", train), ("Test data", test)):
    print(f"{split_label}: {merged_frame.shape}")

## 2. Create Temporal Features

In [None]:
# Derive temporal features from the 'date' column when it is present in the training frame
has_date_column = 'date' in train.columns

if not has_date_column:
    print("Warning: 'date' column not found. Skipping temporal features.")
else:
    # The same helper is applied to both splits so they end up with matching columns
    train = create_temporal_features(train, date_col='date')
    test = create_temporal_features(test, date_col='date')
    print(f"\nTemporal features added. New shape: {train.shape}")

In [None]:
# Visualize temporal patterns: mean target per month and per season.
# Each panel is drawn only when its grouping column exists, so a missing
# 'season' column no longer raises KeyError after the 'month' check passes.
temporal_panels = [
    # (grouping column, panel title, x-axis label)
    ('month', 'Average Target by Month', 'Month'),
    ('season', 'Average Target by Season', 'Season (1=Winter, 2=Spring, 3=Summer, 4=Fall)'),
]
temporal_panels = [panel for panel in temporal_panels if panel[0] in train.columns]

if temporal_panels and 'target' in train.columns:
    # squeeze=False keeps `axes` two-dimensional even when only one panel is drawn
    fig, axes = plt.subplots(
        1, len(temporal_panels),
        figsize=(6 * len(temporal_panels), 5),
        squeeze=False,
    )

    for ax, (group_col, title, xlabel) in zip(axes[0], temporal_panels):
        train.groupby(group_col)['target'].mean().plot(kind='bar', ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Target Value')

    fig.tight_layout()

    # Create the output directory first: savefig fails if it does not exist
    figure_path = Path('../outputs/figures/temporal_patterns.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

## 3. Create Spatial Features

In [None]:
# Build spatial features only when both coordinate columns exist in the training frame
coordinate_columns = ('latitude', 'longitude')
has_coordinates = all(name in train.columns for name in coordinate_columns)

if not has_coordinates:
    print("Warning: Latitude/Longitude columns not found.")
else:
    train = create_spatial_features(train)
    test = create_spatial_features(test)
    print(f"\nSpatial features added. New shape: {train.shape}")

## 4. Create Landsat Spectral Indices

In [None]:
# Calculate vegetation, water, and burn indices
# The same helper is applied to both splits and each frame is rebound to the helper's result.
# NOTE(review): no column check is done before the call — presumably the helper copes with
# missing Landsat bands itself; verify against feature_engineering.create_landsat_indices.
train = create_landsat_indices(train)
test = create_landsat_indices(test)

# Only the training shape is reported.
print(f"\nLandsat indices added. New shape: {train.shape}")

In [None]:
# Visualize spectral indices: one scatter panel (index vs. target) per available index.
spectral_indices = ['NDVI', 'NDWI', 'NBR', 'EVI']
available_indices = [idx for idx in spectral_indices if idx in train.columns]

if available_indices and 'target' in train.columns:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.flatten()

    # zip stops at the shorter sequence, so at most four indices are drawn
    for ax, idx in zip(axes, available_indices):
        ax.scatter(train[idx], train['target'], alpha=0.3, s=1)
        ax.set_xlabel(idx)
        ax.set_ylabel('Target')
        ax.set_title(f'{idx} vs Target')

    # Hide grid cells that have no index to show instead of leaving empty axes
    for unused_ax in axes[len(available_indices):]:
        unused_ax.set_visible(False)

    fig.tight_layout()

    # Create the output directory first: savefig fails if it does not exist
    figure_path = Path('../outputs/figures/spectral_indices.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

## 5. Create Climate-Derived Features

In [None]:
# Create climate features
# The same helper is applied to both splits and each frame is rebound to the helper's result.
# NOTE(review): no column check is done before the call — presumably the helper copes with
# missing climate variables itself; verify against feature_engineering.create_climate_features.
train = create_climate_features(train)
test = create_climate_features(test)

# Only the training shape is reported.
print(f"\nClimate features added. New shape: {train.shape}")

## 6. Create Interaction Features

In [None]:
# Candidate feature pairs to interact, in the order they should be created
candidate_pairs = [
    ('NDVI', 'ppt'),
    ('NDWI', 'soil'),
    ('tmax', 'ppt'),
]

# Keep only the pairs whose two columns are both present in the training frame
interaction_pairs = [
    (left, right)
    for left, right in candidate_pairs
    if left in train.columns and right in train.columns
]

# Build the interaction columns for both splits from the same pair list
train = create_interaction_features(train, interaction_pairs)
test = create_interaction_features(test, interaction_pairs)

print(f"\nInteraction features added. New shape: {train.shape}")

## 7. Create Aggregation Features

In [None]:
def _with_quadrant_stats(frame, columns):
    """Return `frame` with mean/std aggregations of `columns` grouped by 'quadrant'."""
    return create_aggregation_features(
        frame,
        group_col='quadrant',
        agg_cols=columns,
        agg_funcs=['mean', 'std']
    )


# Spatial aggregations are only possible once the 'quadrant' column exists
if 'quadrant' in train.columns:
    agg_cols = [name for name in ('NDVI', 'ppt', 'tmax') if name in train.columns]

    # Skip the step entirely when none of the candidate columns are available
    if agg_cols:
        train = _with_quadrant_stats(train, agg_cols)
        test = _with_quadrant_stats(test, agg_cols)

        print(f"\nAggregation features added. New shape: {train.shape}")

## 8. Handle Missing Values

In [None]:
# Fill any remaining missing values
# Both splits are imputed with the 'median' strategy, each through its own call.
# NOTE(review): presumably each call computes medians from the frame it receives, so test
# rows would be filled with test-set medians rather than training medians — confirm this
# is intended, and confirm the target column in `train` is not imputed.
train = handle_missing_values(train, strategy='median')
test = handle_missing_values(test, strategy='median')

## 9. Reduce Memory Usage

In [None]:
# Optimize memory usage
print("Optimizing memory usage...")
# Each frame is rebound to the helper's result; the two splits are processed independently.
# NOTE(review): if reduce_mem_usage downcasts per column based on observed value ranges,
# train and test can end up with different dtypes for the same column, and any float16
# columns may not be writable to parquet in the save step — verify against utils.reduce_mem_usage.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

## 10. Save Engineered Datasets

In [None]:
# Persist both engineered splits as parquet files (train first, then test)
engineered_outputs = {
    '../data/processed/train_engineered.parquet': train,
    '../data/processed/test_engineered.parquet': test,
}
for output_path, engineered_frame in engineered_outputs.items():
    save_processed_data(engineered_frame, output_path, format='parquet')

# Final shape report; the last line counts every column in the training frame
train_shape, test_shape = train.shape, test.shape
print(f"\nFinal engineered training data: {train_shape}")
print(f"Final engineered test data: {test_shape}")
print(f"Total features created: {train_shape[1]}")

## 11. Feature Summary

In [None]:
# Display feature categories.
# Columns are assigned to a category by substring match on the column name, so the
# categories can overlap (one column may be counted in several) and the counts are
# approximate rather than a strict partition of the columns.
def _columns_containing(columns, keywords, ignore_case=True):
    """Return the names in `columns` that contain any of `keywords` as a substring.

    With ignore_case=True the column name is lower-cased before matching, so the
    keywords must be given in lower case.
    """
    return [
        col for col in columns
        if any(key in (col.lower() if ignore_case else col) for key in keywords)
    ]


print("\nFeature Summary:")
print("="*50)

temporal_features = _columns_containing(train.columns, ['year', 'month', 'day', 'season', 'quarter'])
spatial_features = _columns_containing(train.columns, ['lat', 'lon', 'distance', 'quadrant'])
# Spectral index names are matched case-sensitively (upper-case acronyms)
spectral_features = _columns_containing(
    train.columns,
    ['NDVI', 'NDWI', 'NBR', 'EVI', 'NDBI', 'MNDWI', 'SAVI'],
    ignore_case=False,
)
# 'tmax'/'tmin' are listed explicitly: the temperature column used earlier in this
# notebook is named 'tmax', which the 'temp' keyword alone does not match
climate_features = _columns_containing(
    train.columns,
    ['temp', 'tmax', 'tmin', 'ppt', 'soil', 'vpd', 'aridity'],
)
# NOTE(review): assumes create_interaction_features names its outputs with an '_x_'
# separator — confirm against feature_engineering
interaction_features = [col for col in train.columns if '_x_' in col]

print(f"Temporal features: {len(temporal_features)}")
print(f"Spatial features: {len(spatial_features)}")
print(f"Spectral indices: {len(spectral_features)}")
print(f"Climate features: {len(climate_features)}")
print(f"Interaction features: {len(interaction_features)}")
print(f"\nTotal features: {train.shape[1]}")

## Summary

This notebook successfully engineered a comprehensive feature set including:
- **Temporal**: Cyclical encoding, seasons, day of year
- **Spatial**: Coordinate transformations, quadrants, distance metrics
- **Spectral**: NDVI, NDWI, NBR, EVI, and other vegetation/water indices
- **Climate**: Temperature range, aridity index, water balance
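- **Interactions**: Key feature combinations
- **Aggregations**: Per-quadrant mean and standard deviation of selected features (when the quadrant column is available)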

The engineered datasets are saved for use in subsequent modeling notebooks.