# 01 - Improved Benchmark Model

This notebook:
- Loads and merges water quality, Landsat, and TerraClimate datasets
- Trains XGBoost models using optimized hyperparameters
- Evaluates performance with R² and RMSE metrics
- Saves trained models

In [None]:
import sys
from pathlib import Path

sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from data_loading import (
    load_water_quality_data,
    load_landsat_data,
    load_terraclimate_data,
    merge_all_datasets,
    handle_missing_values,
    split_features_target
)
from model_training import (
    train_xgboost_model,
    evaluate_model,
    save_model,
    BEST_XGB_PARAMS,
    get_feature_importance
)
from utils import setup_logging, calculate_metrics, print_metrics

import warnings
# NOTE(review): 'ignore' with no category silences every warning raised after
# this point, including deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots created below.
sns.set_style('whitegrid')

## 1. Load Data

In [None]:
# Load the water-quality inputs (train, test, submission template) from the raw-data folder.
wq_sources = dict(
    train_path='../data/raw/train.csv',
    test_path='../data/raw/test.csv',
    submission_template_path='../data/raw/submission_template.csv',
)
train_wq, test_wq, submission_template = load_water_quality_data(**wq_sources)

In [None]:
# Load the Landsat tables that accompany the train and test samples.
landsat_train_csv = '../data/raw/train_landsat.csv'
landsat_test_csv = '../data/raw/test_landsat.csv'
train_landsat, test_landsat = load_landsat_data(
    train_landsat_path=landsat_train_csv,
    test_landsat_path=landsat_test_csv,
)

In [None]:
# Load the TerraClimate tables that accompany the train and test samples.
climate_sources = {
    'train_climate_path': '../data/raw/train_terraclimate.csv',
    'test_climate_path': '../data/raw/test_terraclimate.csv',
}
train_climate, test_climate = load_terraclimate_data(**climate_sources)

## 2. Merge Datasets

In [None]:
# Combine the three training sources into one frame, then report its shape and columns.
train_merged = merge_all_datasets(train_wq, train_landsat, train_climate)

train_shape = train_merged.shape
train_columns = list(train_merged.columns)
print(f"\nTraining data shape: {train_shape}")
print(f"Columns: {train_columns}")

In [None]:
# Build the merged test frame from the matching three test sources.
test_sources = (test_wq, test_landsat, test_climate)
test_merged = merge_all_datasets(*test_sources)
print(f"\nTest data shape: {test_merged.shape}")

## 3. Handle Missing Values

In [None]:
# Impute missing values in both merged frames using the same strategy.
# NOTE(review): each frame is passed on its own, so presumably the test frame
# is filled from test-set medians rather than training medians — confirm.
fill_strategy = 'median'
train_merged = handle_missing_values(train_merged, strategy=fill_strategy)
test_merged = handle_missing_values(test_merged, strategy=fill_strategy)

## 4. Prepare Features and Target

In [None]:
# Define columns to drop: identifier/date columns that are not model features.
# Only columns actually present in the training frame are listed, each at most
# once. The previous logic appended 'date' a second time when both columns were
# present and listed 'date' even when only 'uid' existed.
drop_cols = [col for col in ('uid', 'date') if col in train_merged.columns]

# Separate model inputs from the target, using identical settings for both frames.
split_options = {'target_col': 'target', 'drop_cols': drop_cols}

X_train_full, y_train_full = split_features_target(train_merged, **split_options)

# For the test frame only the feature matrix is kept; the second return value is discarded.
X_test, _ = split_features_target(test_merged, **split_options)

In [None]:
# Hold out 20% of the labelled data for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Report the shape of every feature matrix.
for label, features in (
    ("Train set", X_train),
    ("Validation set", X_val),
    ("Test set", X_test),
):
    print(f"{label}: {features.shape}")

## 5. Train XGBoost Model with Best Parameters

In [None]:
# List the hyperparameters that will be passed to the trainer, one per line.
param_lines = [f"  {name}: {setting}" for name, setting in BEST_XGB_PARAMS.items()]
print("\n".join(["Training with BEST_XGB_PARAMS:", *param_lines]))

In [None]:
# Fit the model on the training split; the validation split is passed alongside it
# together with the early-stopping and verbosity settings.
fit_options = {
    'params': BEST_XGB_PARAMS,
    'early_stopping_rounds': 50,
    'verbose': True,
}
model = train_xgboost_model(X_train, y_train, X_val, y_val, **fit_options)

## 6. Evaluate Model

In [None]:
# Score the fitted model on the data it was trained on.
train_label = "Training Set"
train_metrics = evaluate_model(model, X_train, y_train, train_label)

In [None]:
# Score the fitted model on the held-out validation split.
val_label = "Validation Set"
val_metrics = evaluate_model(model, X_val, y_val, val_label)

## 7. Feature Importance

In [None]:
# Request the top_n=20 feature importances for the fitted model.
feature_names = X_train.columns.tolist()
importance_df = get_feature_importance(model, feature_names, top_n=20)

print("\nTop 20 Most Important Features:")
print(importance_df)

In [None]:
# Plot feature importance as a horizontal bar chart and save it to disk.
figure_path = Path('../outputs/figures/feature_importance_benchmark.png')
# Create the output folder first: savefig raises if the directory is missing,
# which would break a Restart & Run All on a fresh checkout.
figure_path.parent.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Top 20 Feature Importances')
ax.invert_yaxis()  # barh draws the first row at the bottom; flip so it appears on top
fig.tight_layout()
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## 8. Save Model

In [None]:
# Persist the fitted benchmark model and confirm on stdout.
model_path = '../models/xgboost_benchmark.pkl'
save_model(model, model_path)
print("\nBenchmark model saved successfully!")

## 9. Summary

This notebook established a strong benchmark model using:
- Merged water quality, Landsat, and TerraClimate data
- Optimized XGBoost hyperparameters
- Proper train/validation split

Next steps:
- Engineer additional features (Notebook 02)
- Add geospatial features (Notebook 03)
- Implement full training pipeline (Notebook 04)