# 04 - Training and Validation Pipeline

This notebook implements a complete ML pipeline:
- Loads fully engineered features
- Performs standard K-Fold and spatial (GroupKFold) cross-validation
- Trains final XGBoost models with best parameters
- Optional: Hyperparameter tuning with Optuna
- Evaluates performance and saves models

This is the production-ready training pipeline.

In [None]:
# Add ../src to the import path (presumably where the project modules imported below live)
import sys
sys.path.append('../src')

# Third-party libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
# Blanket filter: silences every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# Project helpers: data loading, model training/evaluation, and logging utilities
from data_loading import load_processed_data, split_features_target
from model_training import (
    BEST_XGB_PARAMS,
    train_xgboost_model,
    evaluate_model,
    cross_validate_model,
    spatial_cross_validate,
    hyperparameter_tuning_optuna,
    save_model,
    get_feature_importance
)
from utils import save_json, print_metrics

# Render figures inline and apply seaborn's 'whitegrid' style to all plots
%matplotlib inline
sns.set_style('whitegrid')

## 1. Load Fully Engineered Data

In [None]:
# Read the train and test parquet files that already contain all features
train_path = '../data/processed/train_with_geospatial.parquet'
test_path = '../data/processed/test_with_geospatial.parquet'
train, test = (load_processed_data(path) for path in (train_path, test_path))

# Report the shape of each split, then the number of columns in train
print(f"Training data: {train.shape}")
print(f"Test data: {test.shape}")
print(f"\nFeature count: {train.shape[1]}")

## 2. Prepare Features and Target

In [None]:
# Identifier and date/time columns to exclude from the feature matrix.
# Each list keeps only the candidates that actually exist in train, so no
# separate "remove columns that don't exist" pass is needed.
id_cols = [col for col in ('uid', 'id') if col in train.columns]
date_cols = [col for col in ('date', 'timestamp') if col in train.columns]

# Plain concatenation (instead of list(set(...))) keeps the order stable
# from run to run; the two candidate lists share no names, so there are no
# duplicates to remove.
drop_cols = id_cols + date_cols

print(f"Dropping columns: {drop_cols}")

In [None]:
# Split the training frame into the feature matrix and the target vector
X_full, y_full = split_features_target(train, target_col='target', drop_cols=drop_cols)

# Apply the same split to the test frame; only the features are used afterwards.
# The discarded value gets an explicit name instead of `_`, because assigning
# to `_` in IPython shadows the automatic "last output" variable.
X_test, _unused_test_target = split_features_target(test, target_col='target', drop_cols=drop_cols)

print(f"\nFeature matrix: {X_full.shape}")
print(f"Target: {y_full.shape}")
print(f"Test features: {X_test.shape}")

In [None]:
# Ensure test has exactly the same columns, in the same order, as train.
# The differences are collected as ordered lists so the log output is stable.
_train_cols = set(X_full.columns)
_test_cols = set(X_test.columns)
missing_cols = [col for col in X_full.columns if col not in _test_cols]
extra_cols = [col for col in X_test.columns if col not in _train_cols]

if missing_cols:
    # NOTE(review): zero-filling a feature the model was trained on can hide an
    # upstream feature-engineering mismatch; confirm 0 is a sensible default.
    print(f"Adding missing columns to test: {missing_cols}")

if extra_cols:
    print(f"Removing extra columns from test: {extra_cols}")

# A single reindex adds the missing columns (filled with 0), drops the extra
# ones and applies the training column order. It returns a new frame, so the
# cell does not mutate X_test in place and is safe to re-run.
X_test = X_test.reindex(columns=X_full.columns, fill_value=0)

assert list(X_test.columns) == list(X_full.columns), "test/train feature columns differ"
print(f"\nAligned test features: {X_test.shape}")

## 3. Train/Validation Split

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
holdout_options = dict(test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, **holdout_options)

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")

## 4. K-Fold Cross-Validation

In [None]:
# 5-fold cross-validation over the whole training matrix with the stored parameters
kfold_options = {
    'params': BEST_XGB_PARAMS,
    'n_splits': 5,
    'random_state': 42,
}
cv_results = cross_validate_model(X_full, y_full, **kfold_options)

# Persist the cross-validation results as JSON
cv_log_path = '../outputs/logs/cv_results.json'
save_json(cv_results, cv_log_path)

## 5. Spatial Cross-Validation

In [None]:
# Spatial CV groups rows by their 'spatial_cluster' label; skip it when the column is absent
if 'spatial_cluster' not in train.columns:
    print("Spatial cluster column not found. Skipping spatial CV.")
else:
    # NOTE(review): assumes the rows of train line up one-to-one with X_full/y_full,
    # and that 'spatial_cluster' may also still be present as a feature in X_full — confirm.
    spatial_groups = train['spatial_cluster']

    spatial_cv_results = spatial_cross_validate(
        X_full,
        y_full,
        spatial_groups=spatial_groups,
        params=BEST_XGB_PARAMS,
        n_splits=5,
    )

    # Persist the spatial CV results as JSON
    save_json(spatial_cv_results, '../outputs/logs/spatial_cv_results.json')

## 6. Optional: Hyperparameter Tuning with Optuna

**Note**: This section is optional. By default the cell below trains with `BEST_XGB_PARAMS` unchanged;
enable the tuning code in that cell to run hyperparameter optimization instead.

In [None]:
# Optional Optuna hyperparameter tuning.
# Set RUN_OPTUNA to True to run the search; leave it False to train with
# BEST_XGB_PARAMS. Nothing needs to be uncommented. The previous layout
# reassigned FINAL_PARAMS = BEST_XGB_PARAMS unconditionally after the tuning
# block, which would have discarded any tuned parameters.
RUN_OPTUNA = False

if RUN_OPTUNA:
    print("Starting Optuna hyperparameter tuning...")
    tuning_results = hyperparameter_tuning_optuna(
        X_train,
        y_train,
        X_val,
        y_val,
        n_trials=50,
        timeout=3600,  # 1 hour
    )

    # Save the tuned parameters exactly as returned by the search
    save_json(tuning_results['best_params'], '../outputs/logs/optuna_best_params.json')

    # Build the final parameter set on a copy, so the fixed training settings
    # added here do not modify the dict stored in tuning_results
    FINAL_PARAMS = {
        **tuning_results['best_params'],
        'n_estimators': 900,
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'random_state': 42,
        'n_jobs': -1,
    }
    print("Using Optuna-tuned parameters for final model training")
else:
    # Default path: use the stored best parameters as-is
    FINAL_PARAMS = BEST_XGB_PARAMS
    print("Using BEST_XGB_PARAMS for final model training")

## 7. Train Final Model

In [None]:
# Fit the final model on the 80% training split. The validation split is
# passed together with early_stopping_rounds=50 (presumably as the
# early-stopping evaluation set — confirm in train_xgboost_model).
print("Training final XGBoost model...")
final_fit_options = dict(
    params=FINAL_PARAMS,
    early_stopping_rounds=50,
    verbose=True,
)
final_model = train_xgboost_model(X_train, y_train, X_val, y_val, **final_fit_options)

## 8. Evaluate Final Model

In [None]:
# Score the final model on the split it was fitted on and on the held-out split
train_metrics = evaluate_model(final_model, X_train, y_train, "Training Set")
val_metrics = evaluate_model(final_model, X_val, y_val, "Validation Set")

# Bundle both metric sets with the K-fold CV results and write them to one JSON log
all_metrics = dict(train=train_metrics, validation=val_metrics, cv=cv_results)
save_json(all_metrics, '../outputs/logs/final_model_metrics.json')

## 9. Predictions vs Actuals Visualization

In [None]:
# Predictions for both splits (reused by the residuals analysis further down)
y_train_pred = final_model.predict(X_train)
y_val_pred = final_model.predict(X_val)

# One scatter panel per split: actual values on x, predictions on y
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = [
    (y_train, y_train_pred, f'Training Set (R²={train_metrics["r2"]:.4f})'),
    (y_val, y_val_pred, f'Validation Set (R²={val_metrics["r2"]:.4f})'),
]

for ax, (actual, predicted, panel_title) in zip(axes, panels):
    ax.scatter(actual, predicted, alpha=0.3, s=10)
    # Dashed red diagonal spanning the actual range marks perfect predictions
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.set_title(panel_title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/predictions_vs_actuals.png', dpi=300, bbox_inches='tight')
plt.show()

## 10. Feature Importance Analysis

In [None]:
# Ask the helper for the 30 highest-ranked features of the final model
feature_names = X_train.columns.tolist()
importance_df = get_feature_importance(final_model, feature_names, top_n=30)

print("\nTop 30 Most Important Features:")
print(importance_df)

In [None]:
# Horizontal bar chart of the importance table, first row at the top
fig, ax = plt.subplots(figsize=(12, 10))
bar_positions = range(len(importance_df))
ax.barh(bar_positions, importance_df['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(importance_df['feature'])
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
ax.set_title('Top 30 Feature Importances - Final Model')
# barh draws position 0 at the bottom; flip the axis so row 0 appears first
ax.invert_yaxis()
fig.tight_layout()
fig.savefig('../outputs/figures/final_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# Write the same table to CSV alongside the other logs
importance_df.to_csv('../outputs/logs/feature_importance.csv', index=False)

## 11. Residuals Analysis

In [None]:
# Validation residuals: actual minus predicted
residuals = y_val - y_val_pred

# Left panel: residual histogram; right panel: residuals against predictions
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, scatter_ax = axes

hist_ax.hist(residuals, bins=50, edgecolor='black')
hist_ax.set(xlabel='Residual', ylabel='Frequency', title='Residuals Distribution')
# Red dashed reference line at zero error
hist_ax.axvline(0, color='red', linestyle='--', linewidth=2)

scatter_ax.scatter(y_val_pred, residuals, alpha=0.3, s=10)
scatter_ax.axhline(0, color='red', linestyle='--', linewidth=2)
scatter_ax.set(xlabel='Predicted Value', ylabel='Residual', title='Residuals vs Predicted')
scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/residuals_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 12. Save Final Model

In [None]:
# Persist the model fitted on the 80/20 split
final_model_path = '../models/xgboost_final.pkl'
save_model(final_model, final_model_path)
print("\nFinal model saved successfully!")

## 13. Train Models on Full Dataset (Optional)

Train on 100% of training data for final submission predictions.

In [None]:
# Refit on every training row: no validation data and no early stopping
print("Training model on 100% of training data...")

# NOTE(review): with early stopping disabled, the number of boosting rounds
# presumably comes from FINAL_PARAMS alone — confirm this is intended.
full_fit_options = {
    'X_val': None,
    'y_val': None,
    'params': FINAL_PARAMS,
    'early_stopping_rounds': None,
    'verbose': False,
}
full_model = train_xgboost_model(X_full, y_full, **full_fit_options)

# Persist the full-data model under its own file name
save_model(full_model, '../models/xgboost_full.pkl')
print("Full dataset model saved!")

## Summary

This notebook implemented a complete training and validation pipeline:

### Models Trained:
1. **xgboost_final.pkl**: Trained with 80/20 split, validated
2. **xgboost_full.pkl**: Trained on 100% of data for submission

### Validation Strategies:
- Standard K-Fold Cross-Validation
- Spatial GroupKFold Cross-Validation
- Train/Validation holdout split

### Performance:
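Markdown cells do not interpolate Python values, so the scores are not repeated here. They are reported by the evaluation cells above and saved to `../outputs/logs/final_model_metrics.json`:
- Training R²: `train_metrics['r2']`
- Validation R²: `val_metrics['r2']`
- CV Mean R²: `cv_results['mean_r2']`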

Next: Generate submission predictions (Notebook 05)