# CatBoost Dasymetric Poverty Mapping Notebook

## Checklist
- Load and validate input feature and target datasets
- Prepare training matrix (fractional assignment already baked into engineered dataset)
- Train CatBoost with cross-validation and capture metrics
- Generate predictions and evaluation plots
- Save all tabular outputs (CSV) and visualizations (PNG)
- Copy Random Forest outputs and create comparison manifest


In [None]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Output locations, all resolved relative to the notebook's working directory.
OUTPUT_ROOT = Path('../output/catBoost')
DATA_OUT = OUTPUT_ROOT.joinpath('data')
VIS_OUT = OUTPUT_ROOT.joinpath('visualizations')
RF_OUT = OUTPUT_ROOT.joinpath('randomForest_outputs')

# Create every output folder up front (missing parents included);
# folders that already exist are left untouched.
for out_dir in (DATA_OUT, VIS_OUT, RF_OUT):
    out_dir.mkdir(parents=True, exist_ok=True)

# Helper: fail fast with clear message
def abort(msg: str):
    """Report a fatal problem and stop execution.

    Prints ``msg`` prefixed with ``ERROR: `` to standard output, then raises
    ``SystemExit`` with exit code 1.
    """
    error_line = f"ERROR: {msg}"
    print(error_line)
    raise SystemExit(1)

print("=== CATBOOST DASYMETRIC POVERTY MAPPING ===\n")

# Candidate engineered datasets, in priority order. The spatially-imputed file
# comes first so CatBoost is trained on the same data as the RandomForest model.
ENGINEERED_PATHS = [
    Path('../assets/grid_features_spatial_imputed.csv'),  # FIRST: Same as RandomForest
    Path('../assets/grid_with_comprehensive_data.csv'),
    Path('../assets/grid_features_engineered.csv'),
]

print("Checking for engineered dataset:")
# Probe every candidate once, report each result, then keep the first hit.
availability = [(candidate, candidate.exists()) for candidate in ENGINEERED_PATHS]
for candidate, found in availability:
    status = '✓ Found' if found else '✗ Not found'
    print(f"  {candidate}: {status}")
engineered_file = next((candidate for candidate, found in availability if found), None)

if engineered_file is None:
    abort('No engineered dataset found. Please run the data preparation notebooks first.')

print(f"\n✓ Using dataset: {engineered_file}")
using_spatially_imputed = 'spatial_imputed' in str(engineered_file)
if using_spatially_imputed:
    print("  ✅ Using same spatially-imputed data as RandomForest for fair comparison")

# Read the selected CSV; any read failure ends the run with a clear message.
try:
    df = pd.read_csv(engineered_file)
    n_rows, n_cols = df.shape
    print(f"✓ Loaded {n_rows} rows and {n_cols} columns")
except Exception as read_error:
    abort(f'Failed to read dataset: {read_error}')

# The target column must be present (same target as RandomForest).
TARGET = 'poverty_rate'
if TARGET not in df.columns:
    abort(f"Dataset missing 'poverty_rate' column. Available columns: {list(df.columns)}")
print(f"✓ Target column: {TARGET}")

# Only rows with a known poverty rate can be used for training.
has_target = df[TARGET].notna()
df_train = df.loc[has_target].copy()
print(f"✓ Training samples (with poverty data): {len(df_train)}")

if not len(df_train):
    abort("No training samples found (all poverty_rate values are null)")

# Prepare features - exclude the target plus identifier, geometry and location
# columns (same exclusion list as RandomForest).
EXCLUDE_COLS = [
    TARGET, 'grid_id', '.geo', 'system:index', 'x_idx', 'y_idx',
    'geometry', 'barangay_name', 'lon', 'lat', 'centroid', 'cluster'
]
feature_cols = [c for c in df_train.columns if c not in EXCLUDE_COLS]
print(f"✓ Feature columns: {len(feature_cols)}")

X = df_train[feature_cols].copy()
y = df_train[TARGET].copy()

# Handle missing values (mean imputation for consistency).
# Only float64/int64 columns are filled; other dtypes are passed through as-is.
missing_before = X.isnull().sum().sum()
if missing_before > 0:
    print(f"⚠ Filling {missing_before} missing values with column means")
    X = X.apply(lambda s: s.fillna(s.mean()) if s.dtype in ['float64', 'int64'] else s)
else:
    print(f"✓ No missing values - spatial imputation already applied")

# Identify categorical features (positions within feature_cols).
# CatBoost only accepts integer or string values in categorical columns, so
# float columns are never flagged here: a low-cardinality float column (for
# example one that was just mean-imputed) would make Pool construction fail.
cat_features = [
    position
    for position, column in enumerate(feature_cols)
    if X[column].dtype == 'object'
    or (X[column].dtype == 'int64' and X[column].nunique() < 10)
]

print(f"✓ Categorical features detected: {len(cat_features)}")
print(f"\nDataset ready for training:")
print(f"  Features shape: {X.shape}")
print(f"  Target shape: {y.shape}")
print(f"  Target range: [{y.min():.3f}, {y.max():.3f}]")
print(f"  Target mean: {y.mean():.3f}")

# Cross-validation: 5 shuffled folds with a fixed seed so runs are repeatable.
print(f"\n=== CROSS-VALIDATION (5-FOLD) ===")
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rows = []
# Running sum of per-fold feature importances; averaged after the loop.
feature_importance_accum = np.zeros(len(feature_cols))

for fold_num, (train_idx, test_idx) in enumerate(kfold.split(X), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    test_pool = Pool(X_test, y_test, cat_features=cat_features)

    model = CatBoostRegressor(
        iterations=500,
        depth=8,
        learning_rate=0.05,
        loss_function='RMSE',
        eval_metric='RMSE',
        verbose=False,
        random_state=42
    )
    # Fit on the training fold only. Passing the held-out fold as eval_set
    # would let CatBoost select the best iteration on the very data used for
    # scoring, which makes the reported CV metrics optimistic.
    model.fit(train_pool)

    preds = model.predict(test_pool)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword is not
    # accepted by recent scikit-learn releases and raised a TypeError here.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))

    cv_rows.append({'fold': fold_num, 'r2': r2, 'mae': mae, 'rmse': rmse})
    feature_importance_accum += model.get_feature_importance(train_pool)
    print(f"Fold {fold_num}: R²={r2:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}")

# Per-fold table plus the mean of each metric across folds.
cv_df = pd.DataFrame(cv_rows)
cv_summary = cv_df[['r2', 'mae', 'rmse']].mean()
print(f"\nCV Summary:")
print(f"  Mean R²: {cv_summary['r2']:.4f}")
print(f"  Mean MAE: {cv_summary['mae']:.4f}")
print(f"  Mean RMSE: {cv_summary['rmse']:.4f}")

cv_df.to_csv(DATA_OUT / 'catboost_cv_metrics.csv', index=False)
print(f"✓ Saved CV metrics to {DATA_OUT / 'catboost_cv_metrics.csv'}")

# Fit the final model on every labelled row (more, smaller boosting steps
# than the cross-validation models).
print(f"\n=== TRAINING FINAL MODEL ===")
full_pool = Pool(X, y, cat_features=cat_features)
final_params = dict(
    iterations=800,
    depth=8,
    learning_rate=0.03,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=False,
    random_state=42,
)
final_model = CatBoostRegressor(**final_params)
final_model.fit(full_pool)
print("✓ Final model trained")

# Feature importance: average over the CV folds next to the final model's
# own importances, ranked by the latter.
avg_importance = feature_importance_accum / kfold.get_n_splits()
full_importance = final_model.get_feature_importance(full_pool)
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'cv_importance': avg_importance,
    'full_importance': full_importance,
}).sort_values('full_importance', ascending=False)

importance_csv = DATA_OUT / 'catboost_feature_importance.csv'
importance_df.to_csv(importance_csv, index=False)
print(f"✓ Saved feature importance to {DATA_OUT / 'catboost_feature_importance.csv'}")

print(f"\nTop 10 most important features:")
top_ten = importance_df.head(10)
for feature_name, feature_score in zip(top_ten['feature'], top_ten['full_importance']):
    print(f"  {feature_name}: {feature_score:.1f}")

# Predictions on training set (in-sample): actual, predicted and residual per cell.
full_preds = final_model.predict(full_pool)
pred_df = pd.DataFrame({
    'grid_id': df_train['grid_id'] if 'grid_id' in df_train.columns else range(len(y)),
    'actual': y,
    'predicted': full_preds,
    'residual': y - full_preds
})
pred_df.to_csv(DATA_OUT / 'catboost_predictions.csv', index=False)
print(f"✓ Saved predictions to {DATA_OUT / 'catboost_predictions.csv'}")

# Predict on full dataset (including cells without poverty data)
print(f"\n=== PREDICTING ON FULL GRID ===")
X_full = df[feature_cols].copy()
# Impute with the TRAINING column means so inference sees the same fill values
# the model was fitted with. X is already mean-imputed, and mean imputation
# leaves a column's mean unchanged, so X.mean() gives the training means.
train_means = X.mean(numeric_only=True)
X_full = X_full.apply(
    lambda s: s.fillna(train_means.get(s.name, s.mean()))
    if s.dtype in ['float64', 'int64'] else s
)
full_grid_preds = final_model.predict(Pool(X_full, cat_features=cat_features))

full_pred_df = pd.DataFrame({
    'grid_id': df['grid_id'] if 'grid_id' in df.columns else range(len(df)),
    'poverty_rate_predicted': full_grid_preds
})
# Keep the observed value next to the prediction where one exists.
if TARGET in df.columns:
    full_pred_df['poverty_rate_actual'] = df[TARGET]

full_pred_df.to_csv(DATA_OUT / 'catboost_full_grid_predictions.csv', index=False)
print(f"✓ Predicted poverty for {len(df)} grid cells")
print(f"✓ Saved to {DATA_OUT / 'catboost_full_grid_predictions.csv'}")

# Visualizations
print(f"\n=== GENERATING VISUALIZATIONS ===")
sns.set(style='whitegrid')

# 1. Horizontal bar chart of the 20 highest-ranked features.
imp_top = importance_df.head(20)
bar_positions = range(len(imp_top))
fig_imp, ax_imp = plt.subplots(figsize=(10, 8))
ax_imp.barh(bar_positions, imp_top['full_importance'], color='steelblue')
ax_imp.set_yticks(bar_positions)
ax_imp.set_yticklabels(imp_top['feature'])
ax_imp.set_xlabel('Importance Score')
ax_imp.set_title('CatBoost Feature Importance (Top 20)')
fig_imp.tight_layout()
fig_imp.savefig(VIS_OUT / 'catboost_feature_importance.png', dpi=150, bbox_inches='tight')
plt.close(fig_imp)
print(f"✓ Saved {VIS_OUT / 'catboost_feature_importance.png'}")

# 2. In-sample scatter of predicted against actual, with the y = x reference line.
fig_pva, ax_pva = plt.subplots(figsize=(8, 8))
ax_pva.scatter(pred_df['actual'], pred_df['predicted'], alpha=0.6, s=20)
ax_pva.set_xlabel('Actual Poverty Rate')
ax_pva.set_ylabel('Predicted Poverty Rate')
ax_pva.set_title(f'CatBoost: Predicted vs Actual\n(R² = {r2_score(pred_df["actual"], pred_df["predicted"]):.3f})')
# The diagonal runs from 0 to the largest value seen on either axis.
upper_limit = max(pred_df['actual'].max(), pred_df['predicted'].max())
lims = [0, upper_limit]
ax_pva.plot(lims, lims, 'r--', alpha=0.7, linewidth=2)
ax_pva.grid(True, alpha=0.3)
fig_pva.tight_layout()
fig_pva.savefig(VIS_OUT / 'catboost_pred_vs_actual.png', dpi=150, bbox_inches='tight')
plt.close(fig_pva)
print(f"✓ Saved {VIS_OUT / 'catboost_pred_vs_actual.png'}")

# 3. Histogram (with KDE) of in-sample residuals; the dashed line marks zero error.
fig_res, ax_res = plt.subplots(figsize=(10, 6))
sns.histplot(pred_df['residual'], bins=40, kde=True, color='purple', ax=ax_res)
ax_res.axvline(0, color='red', linestyle='--', linewidth=2, alpha=0.7)
ax_res.set_xlabel('Residual (Actual - Predicted)')
ax_res.set_ylabel('Frequency')
ax_res.set_title('CatBoost Residual Distribution')
fig_res.tight_layout()
fig_res.savefig(VIS_OUT / 'catboost_residuals.png', dpi=150, bbox_inches='tight')
plt.close(fig_res)
print(f"✓ Saved {VIS_OUT / 'catboost_residuals.png'}")

# 4. Grouped bars: one group per fold, one bar per metric.
fig_cv, ax_cv = plt.subplots(figsize=(10, 6))
x_pos = np.arange(len(cv_df))
width = 0.25

# R² sits left of the tick, MAE on it, RMSE right of it.
ax_cv.bar(x_pos - width, cv_df['r2'], width, label='R²', alpha=0.8)
ax_cv.bar(x_pos, cv_df['mae'], width, label='MAE', alpha=0.8)
ax_cv.bar(x_pos + width, cv_df['rmse'], width, label='RMSE', alpha=0.8)

ax_cv.set_xlabel('Fold')
ax_cv.set_ylabel('Metric Value')
ax_cv.set_title('CatBoost Cross-Validation Metrics by Fold')
fold_labels = [f'Fold {i+1}' for i in range(len(cv_df))]
ax_cv.set_xticks(x_pos)
ax_cv.set_xticklabels(fold_labels)
ax_cv.legend()
ax_cv.grid(True, alpha=0.3, axis='y')
fig_cv.tight_layout()
fig_cv.savefig(VIS_OUT / 'catboost_cv_metrics.png', dpi=150, bbox_inches='tight')
plt.close(fig_cv)
print(f"✓ Saved {VIS_OUT / 'catboost_cv_metrics.png'}")

# Mirror the Random Forest outputs into this run's folder for side-by-side comparison.
print(f"\n=== COPYING RANDOM FOREST OUTPUTS ===")
import shutil

rf_copied = []

# Possible RandomForest output directories; only the first one that exists is used.
rf_search_dirs = [
    Path('../output/randomForest'),
    Path('../output/rf'),
    Path('../output/random_forest'),
]

rf_dir = next((candidate for candidate in rf_search_dirs if candidate.exists()), None)
if rf_dir is not None:
    print(f"Found RF output directory: {rf_dir}")
    for item in rf_dir.rglob('*'):
        if not item.is_file():
            continue
        # Recreate the same relative layout underneath RF_OUT.
        rel_path = item.relative_to(rf_dir)
        dest = RF_OUT / rel_path
        dest.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(item, dest)
            rf_copied.append(str(rel_path))
        except Exception as e:
            # Best effort: report the failure and carry on with the remaining files.
            print(f"  ⚠ Failed to copy {item}: {e}")

# Build the manifest text first, then write it in one go.
manifest_parts = [
    '# Random Forest Output Files (for Comparison)\n\n',
    'Copied from RandomForest model outputs for side-by-side comparison.\n\n',
]
if rf_copied:
    manifest_parts.append(f'## Files Copied ({len(rf_copied)})\n\n')
    manifest_parts.extend(f'- `{fp}`\n' for fp in sorted(rf_copied))
else:
    manifest_parts.append('⚠ No Random Forest outputs found to copy.\n')
    manifest_parts.append('\nSearched in:\n')
    manifest_parts.extend(f'- {searched}\n' for searched in rf_search_dirs)

manifest_path = RF_OUT / 'randomForest_output_list.md'
with open(manifest_path, 'w', encoding='utf-8') as f:
    f.write(''.join(manifest_parts))

print(f"✓ Saved manifest to {manifest_path}")
if not rf_copied:
    print(f"⚠ No RF outputs found (run RandomForest notebook first)")
else:
    print(f"✓ Copied {len(rf_copied)} RF output files")

# One-row summary of this run: inputs, sizes and mean cross-validation scores.
run_meta = dict(
    dataset=str(engineered_file),
    target=TARGET,
    n_samples=len(df_train),
    n_features=len(feature_cols),
    n_categorical=len(cat_features),
    cv_r2_mean=cv_summary['r2'],
    cv_mae_mean=cv_summary['mae'],
    cv_rmse_mean=cv_summary['rmse'],
)
metadata_csv = DATA_OUT / 'catboost_run_metadata.csv'
pd.DataFrame([run_meta]).to_csv(metadata_csv, index=False)
print(f"✓ Saved run metadata to {DATA_OUT / 'catboost_run_metadata.csv'}")

# Closing banner listing where everything was written.
banner = '=' * 60
print(f"\n{banner}")
print(f"✓ CATBOOST WORKFLOW COMPLETE")
print(f"{banner}")
print(f"\nOutputs saved to:")
print(f"  Data: {DATA_OUT}")
print(f"  Visualizations: {VIS_OUT}")
print(f"  RF Comparison: {RF_OUT}")

=== CATBOOST DASYMETRIC POVERTY MAPPING ===

Checking for engineered dataset:
  ..\assets\grid_with_comprehensive_data.csv: ✓ Found
  ..\assets\grid_features_engineered.csv: ✓ Found
  ..\assets\grid_features_spatial_imputed.csv: ✓ Found

✓ Using dataset: ..\assets\grid_with_comprehensive_data.csv
✓ Loaded 1724 rows and 73 columns
✓ Target column: poverty_rate
✓ Training samples (with poverty data): 1724
✓ Feature columns: 67
⚠ Filling 80 missing values with column means
✓ Categorical features detected: 3

Dataset ready for training:
  Features shape: (1724, 67)
  Target shape: (1724,)
  Target range: [0.190, 0.798]
  Target mean: 0.498

=== CROSS-VALIDATION (5-FOLD) ===


TypeError: got an unexpected keyword argument 'squared'

In [None]:
# === MAPPING OUTPUTS ===
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.patches import Patch

rule = "=" * 80
print(rule)
print("CREATING CATBOOST POVERTY MAPS")
print(rule)

# Geometry is expected next to the engineered CSV, same name with a .geojson suffix.
grid_file = engineered_file.with_suffix('.geojson')
if not grid_file.exists():
    # Fall back to the first alternative that exists; if none does, grid_file
    # keeps pointing at the missing default path.
    alt_paths = [
        Path('../assets/grid_with_comprehensive_data.geojson'),
        Path('../assets/grid_cells.geojson'),
        Path('../assets/grid_with_poverty_predictions.geojson'),
    ]
    grid_file = next((alt for alt in alt_paths if alt.exists()), grid_file)

# Without geometry no map can be drawn: report it and point at the CSV
# predictions instead. Otherwise fall through to the mapping branch.
if not grid_file.exists():
    print(f"⚠ Grid GeoJSON file not found. Cannot create maps.")
    # grid_file is only reassigned to an alternative that exists, so here it
    # still holds the default path derived from the engineered CSV name.
    print(f"  Searched: {grid_file} and alternatives")
    print(f"  Predictions saved to CSV: {DATA_OUT / 'catboost_full_grid_predictions.csv'}")
else:
    import geopandas as gpd

    print(f"Loading grid geometry from {grid_file}")
    grid_gdf = gpd.read_file(grid_file)

    # A GeoJSON written by an earlier run may already carry a prediction
    # column. Drop it so the merge below does not produce suffixed
    # `_x`/`_y` columns and break every later lookup of
    # 'poverty_rate_predicted'.
    if 'poverty_rate_predicted' in grid_gdf.columns:
        grid_gdf = grid_gdf.drop(columns=['poverty_rate_predicted'])

    # Merge with predictions
    if 'grid_id' in full_pred_df.columns and 'grid_id' in grid_gdf.columns:
        grid_gdf = grid_gdf.merge(full_pred_df[['grid_id', 'poverty_rate_predicted']],
                                  on='grid_id', how='left')
    elif len(grid_gdf) == len(full_pred_df):
        # If no grid_id, assume same order
        grid_gdf['poverty_rate_predicted'] = full_pred_df['poverty_rate_predicted'].values
    else:
        # Positional assignment is only meaningful when both tables have the
        # same number of rows; fail with an explicit message otherwise.
        raise ValueError(
            f"Cannot align predictions with grid geometry: no shared 'grid_id' column and "
            f"row counts differ ({len(full_pred_df)} predictions vs {len(grid_gdf)} grid cells)"
        )
    
    # One 2x2 canvas shared by all four maps.
    fig, axes = plt.subplots(2, 2, figsize=(20, 16))

    # ============================================================================
    # MAP 1: Predicted Poverty Rate (Grid Level)
    # ============================================================================
    ax1 = axes[0, 0]

    # Choropleth of the prediction on a fixed 0-1 colour scale.
    predicted_style = dict(
        cmap='RdYlGn_r',
        legend=True,
        vmin=0,
        vmax=1,
        edgecolor='gray',
        linewidth=0.1,
        legend_kwds={'label': 'Predicted Poverty Rate', 'shrink': 0.8},
    )
    grid_gdf.plot(column='poverty_rate_predicted', ax=ax1, **predicted_style)

    ax1.set_title('CatBoost Predicted Poverty Rate\n(Grid Level - 1km²)',
                  fontsize=14, fontweight='bold')
    ax1.set_xlabel('Longitude')
    ax1.set_ylabel('Latitude')
    ax1.grid(True, alpha=0.2)
    
    # ============================================================================
    # MAP 2: Actual vs Predicted (Training cells only)
    # ============================================================================
    ax2 = axes[0, 1]

    if 'poverty_rate' not in grid_gdf.columns:
        # No observed values in the geometry file: show a placeholder panel.
        ax2.text(0.5, 0.5, 'No actual data available\nfor error mapping',
                 ha='center', va='center', transform=ax2.transAxes, fontsize=12)
        ax2.set_title('Prediction Error (Absolute)', fontsize=14, fontweight='bold')
    else:
        # Absolute error, restricted to cells that have an observed rate.
        observed_mask = grid_gdf['poverty_rate'].notna()
        training_cells = grid_gdf[observed_mask].copy()
        signed_error = training_cells['poverty_rate'] - training_cells['poverty_rate_predicted']
        training_cells['error'] = signed_error.abs()

        error_style = dict(
            cmap='Reds',
            legend=True,
            vmin=0,
            vmax=0.3,
            edgecolor='gray',
            linewidth=0.1,
            legend_kwds={'label': 'Absolute Error', 'shrink': 0.8},
        )
        training_cells.plot(column='error', ax=ax2, **error_style)

        mean_error = training_cells['error'].mean()
        ax2.set_title(f'Prediction Error (Absolute)\nMean Error: {mean_error:.4f}',
                      fontsize=14, fontweight='bold')

    ax2.set_xlabel('Longitude')
    ax2.set_ylabel('Latitude')
    ax2.grid(True, alpha=0.2)
    
    # ============================================================================
    # MAP 3: High Poverty Areas (Classification)
    # ============================================================================
    ax3 = axes[1, 0]

    # Class label -> fill colour, ordered from lowest to highest poverty.
    class_colors = {'Low (<30%)': 'green',
                    'Medium (30-50%)': 'yellow',
                    'High (50-70%)': 'orange',
                    'Very High (>70%)': 'red'}

    # Classify cells. The regressor's output is not bounded, so predictions
    # are clipped to [0, 1] for binning only, and the lowest edge is included.
    # Without this, a prediction of exactly 0 (or anything outside the bins)
    # becomes NaN and the cell vanishes from both the map and the counts.
    # Cells with no prediction at all stay unclassified.
    grid_gdf['poverty_class'] = pd.cut(
        grid_gdf['poverty_rate_predicted'].clip(lower=0, upper=1),
        bins=[0, 0.3, 0.5, 0.7, 1.0],
        labels=list(class_colors),
        include_lowest=True,
    )

    # Draw each class as its own layer so it gets a flat colour.
    for class_name, color in class_colors.items():
        subset = grid_gdf[grid_gdf['poverty_class'] == class_name]
        if len(subset) > 0:
            subset.plot(ax=ax3, color=color, edgecolor='gray', linewidth=0.1, alpha=0.7)

    # Create legend (built by hand because the layers were plotted separately)
    legend_elements = [Patch(facecolor=color, label=label)
                       for label, color in class_colors.items()]
    ax3.legend(handles=legend_elements, loc='upper right', fontsize=10)

    ax3.set_title('CatBoost Poverty Classification\n(Predicted)',
                  fontsize=14, fontweight='bold')
    ax3.set_xlabel('Longitude')
    ax3.set_ylabel('Latitude')
    ax3.grid(True, alpha=0.2)

    # Count cells in each class (share is relative to all grid cells)
    print("\nCatBoost Poverty Classification:")
    total_cells = len(grid_gdf)
    for class_name in class_colors:
        count = (grid_gdf['poverty_class'] == class_name).sum()
        pct = count / total_cells * 100 if total_cells else 0.0
        print(f"  {class_name}: {count} cells ({pct:.1f}%)")
    
    # ============================================================================
    # MAP 4: Comparison - Actual vs Predicted (side by side if available)
    # ============================================================================
    ax4 = axes[1, 1]

    if 'poverty_rate' not in grid_gdf.columns:
        # Nothing observed to draw: show a placeholder panel instead.
        ax4.text(0.5, 0.5, 'No actual data available',
                 ha='center', va='center', transform=ax4.transAxes, fontsize=12)
        ax4.set_title('Actual Poverty Rate', fontsize=14, fontweight='bold')
    else:
        # Observed poverty rate on the same 0-1 colour scale as the prediction map.
        training_cells = grid_gdf[grid_gdf['poverty_rate'].notna()].copy()

        actual_style = dict(
            cmap='RdYlGn_r',
            legend=True,
            vmin=0,
            vmax=1,
            edgecolor='gray',
            linewidth=0.1,
            legend_kwds={'label': 'Actual Poverty Rate', 'shrink': 0.8},
        )
        training_cells.plot(column='poverty_rate', ax=ax4, **actual_style)

        ax4.set_title(f'Actual Poverty Rate\n(Training Cells: {len(training_cells)})',
                      fontsize=14, fontweight='bold')

    ax4.set_xlabel('Longitude')
    ax4.set_ylabel('Latitude')
    ax4.grid(True, alpha=0.2)

    # Write the four-panel figure to disk before displaying it.
    fig.tight_layout()
    fig.savefig(VIS_OUT / 'catboost_poverty_maps.png', dpi=300, bbox_inches='tight')
    print(f"\n✓ Saved {VIS_OUT / 'catboost_poverty_maps.png'}")
    plt.show()
    
    # ============================================================================
    # SUMMARY STATISTICS
    # ============================================================================
    print("\n" + "="*80)
    print("CATBOOST SUMMARY STATISTICS")
    print("="*80)

    print(f"\nGrid-Level Statistics:")
    print(f"  Total cells: {len(grid_gdf)}")
    if 'poverty_rate' in grid_gdf.columns:
        # Re-derive the labelled cells and compute the absolute error here:
        # grid_gdf itself never carries an 'error' column, so the error
        # statistics would otherwise never be printed.
        training_cells = grid_gdf[grid_gdf['poverty_rate'].notna()].copy()
        training_cells['error'] = (training_cells['poverty_rate'] -
                                   training_cells['poverty_rate_predicted']).abs()
        print(f"  Training cells: {len(training_cells)}")
        print(f"  Mean poverty rate (actual): {training_cells['poverty_rate'].mean():.3f}")
        print(f"  Mean poverty rate (predicted): {training_cells['poverty_rate_predicted'].mean():.3f}")
        print(f"  Std dev (actual): {training_cells['poverty_rate'].std():.3f}")
        print(f"  Std dev (predicted): {training_cells['poverty_rate_predicted'].std():.3f}")
        print(f"  Mean absolute error: {training_cells['error'].mean():.3f}")
        print(f"  Max error: {training_cells['error'].max():.3f}")
    else:
        # No observed values: only the predictions can be summarised.
        print(f"  Mean poverty rate (predicted): {grid_gdf['poverty_rate_predicted'].mean():.3f}")
        print(f"  Std dev (predicted): {grid_gdf['poverty_rate_predicted'].std():.3f}")

    print(f"\nPoverty Distribution:")
    print(f"  Min predicted: {grid_gdf['poverty_rate_predicted'].min():.3f}")
    print(f"  Max predicted: {grid_gdf['poverty_rate_predicted'].max():.3f}")
    print(f"  Median predicted: {grid_gdf['poverty_rate_predicted'].median():.3f}")

    print("="*80)