# CatBoost Dasymetric Poverty Mapping Notebook

## Checklist
- Load the engineered grid-level dataset and validate that it contains a target column
- Prepare training matrix (fractional assignment already baked into engineered dataset)
- Train CatBoost with cross-validation and capture metrics
- Generate predictions and evaluation plots
- Save all tabular outputs (CSV) and visualizations (PNG)
- Copy Random Forest outputs and write a manifest listing the copied files


In [None]:
import os
import shutil
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

# Directories: every artifact of this run is written below ./output/catBoost
BASE_DIR = Path('.')
OUTPUT_ROOT = BASE_DIR.joinpath('output', 'catBoost')
DATA_OUT = OUTPUT_ROOT.joinpath('data')                   # CSV outputs
VIS_OUT = OUTPUT_ROOT.joinpath('visualizations')          # PNG outputs
RF_OUT = OUTPUT_ROOT.joinpath('randomForest_outputs')     # copied Random Forest files
# Create the whole tree up front; already-existing directories are fine.
for out_dir in (DATA_OUT, VIS_OUT, RF_OUT):
    out_dir.mkdir(parents=True, exist_ok=True)

# Helper: fail fast with clear message
def abort(msg: str):
    """Report a fatal problem and stop execution.

    Prints ``ERROR: <msg>`` to standard output, then raises ``SystemExit``
    with exit status 1.

    Args:
        msg: Human-readable description of what went wrong.

    Raises:
        SystemExit: Always, with code 1.
    """
    print(f"ERROR: {msg}")
    sys.exit(1)

# Locate engineered dataset (adapt if actual filename differs)
# Assumption: a comprehensive grid-level feature dataset exists similar to RandomForest workflow.
ENGINEERED_PATHS = [
    BASE_DIR / 'assets' / file_name
    for file_name in ('grid_with_comprehensive_data.csv', 'grid_features_engineered.csv')
]
# Candidates are tried in order; the first one present on disk is used.
engineered_file = next(
    (candidate for candidate in ENGINEERED_PATHS if candidate.exists()),
    None,
)
if engineered_file is None:
    expected_paths = ', '.join(map(str, ENGINEERED_PATHS))
    abort('No engineered dataset found. Expected one of: ' + expected_paths)
print(f"Using engineered dataset: {engineered_file}")

# Load dataset; any read/parse failure stops the run with a readable message.
try:
    df = pd.read_csv(engineered_file)
except Exception as read_error:
    abort(f'Failed to read engineered dataset: {read_error}')

# Basic validation: check required target column (assumption: poverty_rate or similar)
TARGET_CANDIDATES = ['poverty_rate', 'poverty_index', 'poverty_magnitude']
# Candidates are checked in order; the first one present in the dataset wins.
TARGET = next((col for col in TARGET_CANDIDATES if col in df.columns), None)
if TARGET is None:
    abort('No target column found. Looked for: ' + ', '.join(TARGET_CANDIDATES))
print(f"Target column: {TARGET}")

# Rows without a target value can be neither trained on nor scored (the
# scikit-learn metrics reject NaN in y_true), so drop them before building X/y.
# `df` itself is filtered so that row counts reported later match the data used.
missing_target = df[TARGET].isna()
if missing_target.any():
    print(f"WARN: Dropping {int(missing_target.sum())} rows with missing {TARGET}.")
    df = df.loc[~missing_target].reset_index(drop=True)

# Drop obvious non-feature columns (IDs, geometry) if present
NON_FEATURE_COLS = [TARGET] + [c for c in ['geometry', 'barangay_name', 'grid_id'] if c in df.columns]
feature_cols = [c for c in df.columns if c not in NON_FEATURE_COLS]
X = df[feature_cols].copy()
y = df[TARGET].copy()

# Determine categorical features (heuristic: non-numeric dtype or low unique
# count) and impute missing values in the same pass:
#   * categorical columns are converted to strings with a 'missing' sentinel,
#     because CatBoost only accepts integer/string values (no floats, no NaN)
#     in categorical features;
#   * remaining numeric columns are filled with their column mean. Taking the
#     mean of a non-numeric column would raise, so it is only done here.
cat_features = []
for idx, col in enumerate(feature_cols):
    column = X[col]
    is_plain_numeric = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )
    if not is_plain_numeric or column.nunique() < 15:
        cat_features.append(idx)
        X[col] = column.astype(str).where(column.notna(), 'missing')
    else:
        X[col] = column.fillna(column.mean())
print(f"Categorical feature count: {len(cat_features)}")

# Cross-validation: 5 shuffled folds with a fixed seed so runs are repeatable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rows = []
# Running sum of per-fold feature importances; averaged after the loop by
# whoever consumes it.
feature_importance_accum = np.zeros(len(feature_cols))
for fold, (train_idx, test_idx) in enumerate(kfold.split(X), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    test_pool = Pool(X_test, y_test, cat_features=cat_features)

    model = CatBoostRegressor(
        iterations=500,
        depth=8,
        learning_rate=0.05,
        loss_function='RMSE',
        eval_metric='RMSE',
        verbose=False,
        random_state=42
    )
    # The held-out fold is deliberately NOT passed as eval_set: with an eval
    # set CatBoost keeps the iteration that scores best on it by default,
    # which would tune the model on the very data used to score it and bias
    # the CV metrics optimistically.
    model.fit(train_pool)

    preds = model.predict(test_pool)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    # RMSE via sqrt(MSE): the `squared=False` keyword is deprecated and has
    # been removed from recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))

    cv_rows.append({'fold': fold, 'r2': r2, 'mae': mae, 'rmse': rmse})
    feature_importance_accum += model.get_feature_importance(train_pool)
    print(f"Fold {fold}: R2={r2:.4f} MAE={mae:.4f} RMSE={rmse:.4f}")

cv_df = pd.DataFrame(cv_rows)
cv_df.to_csv(DATA_OUT / 'catboost_cv_metrics.csv', index=False)
print("Saved CV metrics.")

# Train final model on full data
full_pool = Pool(X, y, cat_features=cat_features)
final_params = dict(
    iterations=800,
    depth=8,
    learning_rate=0.03,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=False,
    random_state=42,
)
final_model = CatBoostRegressor(**final_params)
final_model.fit(full_pool)

# Feature importance: the cross-validation average next to the full-data
# model's importances, ordered by the latter (largest first).
n_folds = kfold.get_n_splits()
avg_importance = feature_importance_accum / n_folds
full_importance = final_model.get_feature_importance(full_pool)
importance_df = pd.DataFrame(
    {
        'feature': feature_cols,
        'cv_importance': avg_importance,
        'full_importance': full_importance,
    }
).sort_values('full_importance', ascending=False)
importance_df.to_csv(DATA_OUT / 'catboost_feature_importance.csv', index=False)
print("Saved feature importance.")

# Predictions on full data (in-sample for exploratory comparison)
full_preds = final_model.predict(full_pool)
pred_df = pd.DataFrame({'prediction': full_preds, 'actual': y})
pred_df.to_csv(DATA_OUT / 'catboost_full_predictions.csv', index=False)
print("Saved full predictions.")

# Visualization helpers
sns.set(style='whitegrid')

# Feature importance plot: top 20 rows of importance_df, drawn in reverse so
# the first row ends up at the top of the horizontal bar chart.
imp_top = importance_df.head(20)
imp_reversed = imp_top.iloc[::-1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(imp_reversed['feature'], imp_reversed['full_importance'], color='steelblue')
ax.set_title('CatBoost Feature Importance (Top 20)')
fig.tight_layout()
fig.savefig(VIS_OUT / 'catboost_feature_importance.png', dpi=150)
plt.close(fig)
print("Saved feature importance plot.")

# Predicted vs Actual, with a dashed y = x reference line spanning the
# combined range of both columns.
both_columns = pred_df[['actual', 'prediction']]
lims = [both_columns.min().min(), both_columns.max().max()]
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(pred_df['actual'], pred_df['prediction'], alpha=0.5)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('CatBoost Predicted vs Actual')
ax.plot(lims, lims, 'r--')
fig.tight_layout()
fig.savefig(VIS_OUT / 'catboost_pred_vs_actual.png', dpi=150)
plt.close(fig)
print("Saved predicted vs actual plot.")

# Residual distribution (actual minus predicted)
residuals = pred_df['actual'] - pred_df['prediction']
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(residuals, bins=30, kde=True, color='purple', ax=ax)
ax.set_title('CatBoost Residual Distribution')
fig.tight_layout()
fig.savefig(VIS_OUT / 'catboost_residuals.png', dpi=150)
plt.close(fig)
print("Saved residual distribution plot.")

# Copy Random Forest outputs (heuristic: look for prior output directory or known files)
# Every candidate directory that exists is mirrored into RF_OUT, preserving
# its internal sub-directory layout.
rf_source_dirs = [BASE_DIR / 'output' / 'randomForest', BASE_DIR / 'output' / 'rf', BASE_DIR / 'output' / 'random_forest']
rf_copied = []
rf_copied_seen = set()  # destinations already written, to spot collisions
for rf_dir in rf_source_dirs:
    if not rf_dir.exists():
        continue
    for root, _dirs, files in os.walk(rf_dir):
        if not files:
            continue
        # One destination directory per source directory, created once
        # rather than once per file.
        dest = RF_OUT / Path(root).relative_to(rf_dir)
        dest.mkdir(parents=True, exist_ok=True)
        for f in files:
            src = Path(root) / f
            target = dest / f
            try:
                shutil.copy2(src, target)
            except OSError as e:
                # Best effort: a file that cannot be copied is reported and
                # skipped. Only I/O errors are tolerated so that programming
                # errors still surface.
                print(f"WARN: Failed to copy {src}: {e}")
                continue
            target_str = str(target)
            if target_str in rf_copied_seen:
                # Two source directories contained the same relative path;
                # the later copy replaced the earlier one.
                print(f"WARN: {target} was overwritten by {src}")
                continue
            rf_copied_seen.add(target_str)
            rf_copied.append(target_str)

# Manifest of RF outputs
manifest_path = RF_OUT / 'randomForest_output_list.md'
with open(manifest_path, 'w', encoding='utf-8') as mh:
    mh.write('# Random Forest Output Files (Copied)\n\n')
    if rf_copied:
        mh.writelines(f'- {fp} \n' for fp in rf_copied)
    else:
        mh.write('No Random Forest output directory found; nothing copied.\n')
print("Random Forest outputs manifest written.")

# Save run metadata: a one-row CSV describing the inputs of this run.
run_meta = dict(
    engineered_file=str(engineered_file),
    target=TARGET,
    n_rows=int(len(df)),
    n_features=len(feature_cols),
    categorical_features_detected=len(cat_features),
)
metadata_frame = pd.DataFrame([run_meta])
metadata_frame.to_csv(DATA_OUT / 'catboost_run_metadata.csv', index=False)
print('Saved run metadata.')

print('CatBoost workflow complete.')