# Model Selection - Full Data Training

**Objective:** Train models on 100% of available data instead of holding out 10% for validation.

**Motivation:**
- In time series forecasting, the most recent data is the most important
- Traditional train/val split (90/10) withholds the most relevant data from training
- We should train on ALL available historical data and validate on truly future data

**Approach:**
1. Load master dataset (all 3,962 samples)
2. Use time-series cross-validation for model evaluation
3. Train final model on 100% of data
4. Compare with previous 90/10 split results

## 1. Load Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# Set style
# Apply the matplotlib style sheet first and the seaborn palette second.
# NOTE(review): keep this order — the later call presumably overrides the
# colour cycle installed by the style sheet; confirm before reordering.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Simple confirmation that the import cell ran to completion.
print("Libraries loaded successfully!")

In [None]:
# Load master dataset
MASTER_DATASET = Path("../data/master_dataset.parquet")

df = pd.read_parquet(MASTER_DATASET)
df['date'] = pd.to_datetime(df['date'])

# Everything downstream (TimeSeriesSplit folds, the positional 90/10 cut and
# the df.tail(...) windows) treats row order as time order. Enforce that here
# instead of trusting the file. Data that is already chronological is left
# completely untouched; otherwise a stable sort keeps rows that share a date
# in their original relative order.
if not df['date'].is_monotonic_increasing:
    print("Rows were not in chronological order; sorting by date.")
    df = df.sort_values('date', kind='mergesort').reset_index(drop=True)

print(f"Dataset loaded: {len(df)} samples")
print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
print(f"Features: {len(df.columns) - 2}")  # Exclude date and rv_5d
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
# Last expression of the cell so the notebook renders the preview table.
df.head()

## 2. Load Previous Results for Comparison

Load the results from the 90/10 split experiment to compare performance.

In [None]:
# Load previous ensemble results (90/10 split)
with open('ensemble_final_metrics_2010.json', 'r') as f:
    previous_results = json.load(f)


def _format_metric(metrics, key, spec):
    """Return metrics[key] formatted with ``spec``, or 'N/A' when unavailable.

    Applying a float format spec such as '.6f' directly to the 'N/A'
    placeholder raises ValueError, so the placeholder has to be chosen
    before any numeric formatting is attempted.
    """
    value = metrics.get(key)
    if value is None:
        return 'N/A'
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        # Present but not numeric: show it verbatim rather than crash.
        return str(value)


print("Previous Results (90/10 split):")
print(f"  Training samples: {previous_results.get('train_samples', 'N/A')}")
print(f"  Validation samples: {previous_results.get('val_samples', 'N/A')}")
print(f"  Validation RMSE: {_format_metric(previous_results, 'val_rmse', '.6f')}")
print(f"  Validation MAE: {_format_metric(previous_results, 'val_mae', '.6f')}")
print(f"  Validation R¬≤: {_format_metric(previous_results, 'val_r2', '.4f')}")

## 3. Time-Series Cross-Validation Setup

Instead of a single train/test split, we'll use **TimeSeriesSplit** to evaluate model performance properly:
- Split data into multiple train/test folds moving forward in time
- Each fold trains on past data and tests on future data
- This simulates realistic forecasting scenarios

In [None]:
# Prepare features and target
# Every column other than the timestamp and the target is used as a feature.
non_feature_cols = ('date', 'rv_5d')
feature_cols = [col for col in df.columns if col not in non_feature_cols]

# Plain NumPy views of the features, target and timestamps.
X = df[feature_cols].values
y = df['rv_5d'].values
dates = df['date'].values

print(f"Features: {len(feature_cols)}")
print(f"Samples: {len(X)}")
print(f"Feature columns: {feature_cols}")

# Five forward-chaining folds, used to check how stable each model is.
tscv = TimeSeriesSplit(n_splits=5)


def _date_span(row_positions):
    """Return 'first to last' calendar dates for the given row positions."""
    fold_dates = dates[row_positions]
    first_day = pd.to_datetime(fold_dates[0]).date()
    last_day = pd.to_datetime(fold_dates[-1]).date()
    return f"{first_day} to {last_day}"


print(f"\nTime Series Cross-Validation: {tscv.n_splits} folds")
for fold_no, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
    print(f"\nFold {fold_no}:")
    print(f"  Train: {len(train_idx)} samples, {_date_span(train_idx)}")
    print(f"  Test:  {len(test_idx)} samples, {_date_span(test_idx)}")

## 4. Evaluate Models with Time-Series CV

Evaluate each base model using time-series cross-validation to see how they perform across different time periods.

In [None]:
# Load best hyperparameters from previous experiments
def _read_json(filename):
    """Parse the JSON document stored at ``filename`` and return it."""
    with open(filename, 'r') as handle:
        return json.load(handle)


# Hyperparameters saved by earlier experiments, one file per estimator.
xgb_params = _read_json('xgb_best_params_2010.json')
lgbm_params = _read_json('lgbm_best_params_2010.json')
ridge_params = _read_json('linear_params_2010.json')

print("Loaded hyperparameters from previous experiments")
print(f"XGBoost params: {xgb_params}")
print(f"LightGBM params: {lgbm_params}")
print(f"Ridge params: {ridge_params}")

In [None]:
def evaluate_model_cv(model, X, y, tscv, model_name):
    """Score ``model`` on every fold produced by ``tscv`` and summarise.

    For each (train, test) index pair from ``tscv.split(X)`` the same
    ``model`` object is refit on the training rows and scored on the test
    rows. Per-fold RMSE, MAE and R² are printed as they are computed,
    followed by a mean/std summary labelled with ``model_name``.

    Returns a dict with keys ``rmse_mean``, ``rmse_std``, ``mae_mean``,
    ``mae_std``, ``r2_mean`` and ``r2_std`` (std is ``np.std`` with its
    default arguments). On return ``model`` is left fitted on the last fold.
    """
    per_fold = {'rmse': [], 'mae': [], 'r2': []}

    for fold_no, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        # Refit from the training rows of this fold, then score the rest.
        model.fit(X[train_idx], y[train_idx])
        y_actual = y[test_idx]
        y_hat = model.predict(X[test_idx])

        fold_rmse = np.sqrt(mean_squared_error(y_actual, y_hat))
        fold_mae = mean_absolute_error(y_actual, y_hat)
        fold_r2 = r2_score(y_actual, y_hat)

        for metric, value in (('rmse', fold_rmse), ('mae', fold_mae), ('r2', fold_r2)):
            per_fold[metric].append(value)

        print(f"  Fold {fold_no}: RMSE={fold_rmse:.6f}, MAE={fold_mae:.6f}, R¬≤={fold_r2:.4f}")

    # Collapse the per-fold scores into <metric>_mean / <metric>_std entries.
    results = {}
    for metric in ('rmse', 'mae', 'r2'):
        results[f'{metric}_mean'] = np.mean(per_fold[metric])
        results[f'{metric}_std'] = np.std(per_fold[metric])

    print(f"\n{model_name} CV Results:")
    print(f"  RMSE: {results['rmse_mean']:.6f} ¬± {results['rmse_std']:.6f}")
    print(f"  MAE:  {results['mae_mean']:.6f} ¬± {results['mae_std']:.6f}")
    print(f"  R¬≤:   {results['r2_mean']:.4f} ¬± {results['r2_std']:.4f}")

    return results


print("Evaluation function defined!")

In [None]:
# Evaluate XGBoost
# Cross-validate an XGBoost regressor built from the loaded hyperparameters.
print("\n".join(["=" * 60, "Evaluating XGBoost", "=" * 60]))
xgb_model = xgb.XGBRegressor(random_state=42, **xgb_params)
xgb_cv_results = evaluate_model_cv(
    model=xgb_model, X=X, y=y, tscv=tscv, model_name="XGBoost"
)

In [None]:
# Evaluate LightGBM
# Cross-validate a LightGBM regressor built from the loaded hyperparameters.
print("\n".join(["", "=" * 60, "Evaluating LightGBM", "=" * 60]))
lgbm_model = lgb.LGBMRegressor(random_state=42, verbose=-1, **lgbm_params)
lgbm_cv_results = evaluate_model_cv(
    model=lgbm_model, X=X, y=y, tscv=tscv, model_name="LightGBM"
)

In [None]:
# Evaluate Ridge
# Cross-validate a Ridge regressor built from the loaded hyperparameters.
print("\n".join(["", "=" * 60, "Evaluating Ridge Regression", "=" * 60]))
ridge_model = Ridge(random_state=42, **ridge_params)
ridge_cv_results = evaluate_model_cv(
    model=ridge_model, X=X, y=y, tscv=tscv, model_name="Ridge"
)

## 5. Compare CV Results with Previous 90/10 Split

Let's see how cross-validation performance compares to the single 90/10 split approach.

In [None]:
# Create comparison dataframe
# Per-model CV summaries, in the order they should appear in the table.
cv_results_by_model = {
    'XGBoost': xgb_cv_results,
    'LightGBM': lgbm_cv_results,
    'Ridge': ridge_cv_results,
}

# The previous-results file holds a single validation RMSE (loaded above as
# the earlier ensemble result), so the same value is repeated on every row as
# a common reference rather than being a per-model score.
previous_rmse = previous_results.get('val_rmse', np.nan)

comparison = pd.DataFrame({
    'Model': list(cv_results_by_model),
    'CV_RMSE': [res['rmse_mean'] for res in cv_results_by_model.values()],
    'CV_Std': [res['rmse_std'] for res in cv_results_by_model.values()],
    'Previous_90/10_RMSE': [previous_rmse] * len(cv_results_by_model),
})

print("\n" + "="*60)
print("CROSS-VALIDATION vs 90/10 SPLIT COMPARISON")
print("="*60)
print(comparison.to_string(index=False))
# Report the real fold count so the note cannot drift from the CV setup.
print(f"\nNote: CV RMSE is averaged across {tscv.n_splits} time-series folds")

## 6. Dual Model Strategy: Train Model A (90%) and Model B (100%)

**Strategy:** Train two models and compare them empirically:
- **Model A (90%)**: Conservative model with validation baseline for drift detection
- **Model B (100%)**: Aggressive model using all available data for maximum performance

We'll compare both models on multiple time windows to see which performs better in practice.

In [None]:
# Prepare data splits
# Positional 90/10 cut: the first 90% of rows train Model A, the rest are held out.
train_cutoff = int(len(df) * 0.9)
train_df_90, val_df = df[:train_cutoff], df[train_cutoff:]


def _report_split(label, frame):
    """Print the row count and date coverage of one split."""
    first_day = frame['date'].min().date()
    last_day = frame['date'].max().date()
    print(f"\n{label}: {len(frame)} samples")
    print(f"  Date range: {first_day} to {last_day}")


def _to_xy(frame):
    """Return the (features, target) arrays for ``frame``."""
    return frame[feature_cols].values, frame['rv_5d'].values


print("Data Splits:")
_report_split("Model A Training (90%)", train_df_90)
_report_split("Validation Set (10%)", val_df)
_report_split("Model B Training (100%)", df)

# Feature matrices and targets for each split.
X_train_90, y_train_90 = _to_xy(train_df_90)
X_val, y_val = _to_xy(val_df)
X_full, y_full = _to_xy(df)

### 6.1 Train Model A: Ensemble on 90% Data

In [None]:
print("\n".join(["=" * 60, "TRAINING MODEL A (90% DATA)", "=" * 60]))

# Stage 1: the two tree-based base learners, fit on the 90% training rows.
xgb_model_a = xgb.XGBRegressor(random_state=42, **xgb_params)
lgbm_model_a = lgb.LGBMRegressor(random_state=42, verbose=-1, **lgbm_params)

print("\nTraining XGBoost (Model A)...")
xgb_model_a.fit(X_train_90, y_train_90)
print("‚úì XGBoost trained")

print("Training LightGBM (Model A)...")
lgbm_model_a.fit(X_train_90, y_train_90)
print("‚úì LightGBM trained")

# Stage 2: the base-model predictions become the two meta-features.
# NOTE(review): these are predictions on the same rows the base models were
# fit on; stacking is more commonly fit on out-of-fold predictions. Confirm
# this is intended before relying on the meta-learner's weights.
print("\nGenerating meta-features...")
xgb_pred_train_a = xgb_model_a.predict(X_train_90)
lgbm_pred_train_a = lgbm_model_a.predict(X_train_90)
X_meta_train_a = np.column_stack((xgb_pred_train_a, lgbm_pred_train_a))

# Stage 3: Ridge meta-learner on top of the two prediction columns.
print("Training meta-learner (Ridge)...")
meta_model_a = Ridge(random_state=42, **ridge_params)
meta_model_a.fit(X_meta_train_a, y_train_90)
print("‚úì Meta-learner trained")

print("\n‚úì Model A (Ensemble) training complete!")

# Score the full stack on the 10% holdout, which Model A never saw.
print("\nValidating Model A on holdout set...")
xgb_pred_val_a = xgb_model_a.predict(X_val)
lgbm_pred_val_a = lgbm_model_a.predict(X_val)
X_meta_val_a = np.column_stack((xgb_pred_val_a, lgbm_pred_val_a))
y_pred_val_a = meta_model_a.predict(X_meta_val_a)

rmse_val_a, mae_val_a, r2_val_a = (
    np.sqrt(mean_squared_error(y_val, y_pred_val_a)),
    mean_absolute_error(y_val, y_pred_val_a),
    r2_score(y_val, y_pred_val_a),
)

print(f"\nModel A Validation Results:")
print(f"  RMSE: {rmse_val_a:.6f}")
print(f"  MAE:  {mae_val_a:.6f}")
print(f"  R¬≤:   {r2_val_a:.4f}")

### 6.2 Train Model B: Ensemble on 100% Data

In [None]:
print("\n".join(["", "=" * 60, "TRAINING MODEL B (100% DATA)", "=" * 60]))

# Stage 1: the two tree-based base learners, fit on every available row.
xgb_model_b = xgb.XGBRegressor(random_state=42, **xgb_params)
lgbm_model_b = lgb.LGBMRegressor(random_state=42, verbose=-1, **lgbm_params)

print("\nTraining XGBoost (Model B)...")
xgb_model_b.fit(X_full, y_full)
print("‚úì XGBoost trained")

print("Training LightGBM (Model B)...")
lgbm_model_b.fit(X_full, y_full)
print("‚úì LightGBM trained")

# Stage 2: the base-model predictions become the two meta-features.
# NOTE(review): these are predictions on the same rows the base models were
# fit on; stacking is more commonly fit on out-of-fold predictions. Confirm
# this is intended before relying on the meta-learner's weights.
print("\nGenerating meta-features...")
xgb_pred_train_b = xgb_model_b.predict(X_full)
lgbm_pred_train_b = lgbm_model_b.predict(X_full)
X_meta_train_b = np.column_stack((xgb_pred_train_b, lgbm_pred_train_b))

# Stage 3: Ridge meta-learner on top of the two prediction columns.
print("Training meta-learner (Ridge)...")
meta_model_b = Ridge(random_state=42, **ridge_params)
meta_model_b.fit(X_meta_train_b, y_full)
print("‚úì Meta-learner trained")

print("\n‚úì Model B (Ensemble) training complete!")

# Score on the same rows used to validate Model A. Those rows are part of
# X_full, so for Model B this is an in-sample figure.
print("\nEvaluating Model B on same validation set...")
xgb_pred_val_b = xgb_model_b.predict(X_val)
lgbm_pred_val_b = lgbm_model_b.predict(X_val)
X_meta_val_b = np.column_stack((xgb_pred_val_b, lgbm_pred_val_b))
y_pred_val_b = meta_model_b.predict(X_meta_val_b)

rmse_val_b, mae_val_b, r2_val_b = (
    np.sqrt(mean_squared_error(y_val, y_pred_val_b)),
    mean_absolute_error(y_val, y_pred_val_b),
    r2_score(y_val, y_pred_val_b),
)

print(f"\nModel B Performance on Validation Set:")
print(f"  RMSE: {rmse_val_b:.6f}")
print(f"  MAE:  {mae_val_b:.6f}")
print(f"  R¬≤:   {r2_val_b:.4f}")

print(f"\nNote: Model B was trained on this data, so these metrics are")
print(f"      in-sample (not a true validation). Use for comparison only.")

### 6.3 Compare Both Models on Multiple Time Windows

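Now let's compare how both models perform on different time windows to see which one is actually better in practice. Keep in mind that every window below is taken from the full dataset that Model B was trained on, so Model B's errors here are in-sample, while Model A is scored on validation rows it did not train on; a Model B "win" on these windows is therefore not by itself evidence of better out-of-sample performance.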

In [None]:
def predict_ensemble(xgb_model, lgbm_model, meta_model, X):
    """Run the two-stage ensemble on ``X`` and return the final predictions.

    Each base model's ``predict(X)`` output becomes one column of the
    meta-feature matrix (``xgb_model`` first, ``lgbm_model`` second), and
    that matrix is passed to ``meta_model.predict``.
    """
    base_predictions = [base.predict(X) for base in (xgb_model, lgbm_model)]
    return meta_model.predict(np.column_stack(base_predictions))

def compare_models_on_window(window_name, window_df):
    """Score Model A and Model B on ``window_df`` and report the better one.

    Both ensembles predict ``rv_5d`` from the ``feature_cols`` columns of
    ``window_df``. The model with the strictly lower RMSE wins (ties go to
    Model A), and ``improvement_pct`` is the RMSE gap expressed as a
    percentage of the losing model's RMSE.

    Returns a dict with the window label, sample count, first/last date,
    each model's RMSE and MAE, the winner ('A' or 'B') and the improvement.
    """
    X_window = window_df[feature_cols].values
    y_true = window_df['rv_5d'].values

    def _score(xgb_model, lgbm_model, meta_model):
        """Return (RMSE, MAE) of one ensemble on this window."""
        y_hat = predict_ensemble(xgb_model, lgbm_model, meta_model, X_window)
        return (
            np.sqrt(mean_squared_error(y_true, y_hat)),
            mean_absolute_error(y_true, y_hat),
        )

    rmse_a, mae_a = _score(xgb_model_a, lgbm_model_a, meta_model_a)
    rmse_b, mae_b = _score(xgb_model_b, lgbm_model_b, meta_model_b)

    # The gap is always measured relative to the loser's RMSE.
    if rmse_b < rmse_a:
        winner = 'B'
        improvement = (rmse_a - rmse_b) / rmse_a * 100
    else:
        winner = 'A'
        improvement = (rmse_b - rmse_a) / rmse_b * 100

    return {
        'window': window_name,
        'samples': len(window_df),
        'date_start': window_df['date'].min().date(),
        'date_end': window_df['date'].max().date(),
        'model_a_rmse': rmse_a,
        'model_a_mae': mae_a,
        'model_b_rmse': rmse_b,
        'model_b_mae': mae_b,
        'winner': winner,
        'improvement_pct': abs(improvement),
    }


print("Comparison functions defined!")

In [None]:
# Define time windows for comparison
# Evaluation windows: the whole holdout, then progressively shorter tails.
# Every window is a subset of df, i.e. of the rows Model B was trained on.
tail_windows = (("Last 30 samples", 30), ("Last 10 samples", 10), ("Last 1 sample", 1))
windows = [("Full Validation Set (10%)", val_df)]
windows += [(label, df.tail(n_rows)) for label, n_rows in tail_windows]

print("\n".join(["", "=" * 80, "DUAL MODEL COMPARISON - MULTIPLE TIME WINDOWS", "=" * 80]))

# Score both models on each window, reporting as we go.
results = []
for window_name, window_df in windows:
    result = compare_models_on_window(window_name, window_df)
    results.append(result)

    report_lines = [
        f"\n{window_name}:",
        f"  Samples: {result['samples']}",
        f"  Date Range: {result['date_start']} to {result['date_end']}",
        f"  Model A RMSE: {result['model_a_rmse']:.6f}",
        f"  Model B RMSE: {result['model_b_rmse']:.6f}",
        f"  Winner: Model {result['winner']} (better by {result['improvement_pct']:.2f}%)",
    ]
    print("\n".join(report_lines))

# One row per window, restricted to the headline columns for display.
results_df = pd.DataFrame(results)
summary_columns = ['window', 'samples', 'model_a_rmse', 'model_b_rmse', 'winner', 'improvement_pct']
print("\n".join(["", "=" * 80, "SUMMARY TABLE", "=" * 80]))
print(results_df[summary_columns].to_string(index=False))

In [None]:
# Visualize comparison
# Visualize comparison: grouped RMSE bars per window, plus a win tally.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: RMSE Comparison
window_names = [r['window'] for r in results]
model_a_rmses = [r['model_a_rmse'] for r in results]
model_b_rmses = [r['model_b_rmse'] for r in results]

x = np.arange(len(window_names))
width = 0.35

axes[0].bar(x - width/2, model_a_rmses, width, label='Model A (90%)', alpha=0.8)
axes[0].bar(x + width/2, model_b_rmses, width, label='Model B (100%)', alpha=0.8)
axes[0].set_xlabel('Time Window')
axes[0].set_ylabel('RMSE')
axes[0].set_title('RMSE Comparison Across Time Windows')
axes[0].set_xticks(x)
axes[0].set_xticklabels([w.replace(' ', '\n') for w in window_names], fontsize=8)
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)

# Plot 2: Win Count
# value_counts() orders by frequency and drops models with zero wins, so a
# fixed [A, B] tick-label list could end up under the wrong bar (e.g. a lone
# "B" bar labelled "Model A"). Pin the order to A, B and fill missing with 0
# so each bar and its label always refer to the same model.
model_labels = {'A': 'Model A\n(90%)', 'B': 'Model B\n(100%)'}
win_counts = results_df['winner'].value_counts().reindex(list(model_labels), fill_value=0)
colors = ['#2ecc71' if w == 'B' else '#3498db' for w in win_counts.index]
bar_positions = np.arange(len(win_counts))
axes[1].bar(bar_positions, win_counts.values, color=colors, alpha=0.8)
axes[1].set_xlabel('Model')
axes[1].set_ylabel('Number of Wins')
axes[1].set_title('Model Performance: Win Count')
axes[1].set_xticks(bar_positions)
axes[1].set_xticklabels([model_labels[w] for w in win_counts.index])
axes[1].grid(axis='y', alpha=0.3)

# Annotate each bar with its count.
for i, v in enumerate(win_counts.values):
    axes[1].text(i, v + 0.05, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Text summary. Derive the denominator from the data and avoid averaging an
# empty list (which yields nan) when Model B never wins.
model_b_wins = [r for r in results if r['winner'] == 'B']
print(f"\nModel B wins: {len(model_b_wins)}/{len(results)} windows")
if model_b_wins:
    avg_improvement = np.mean([r['improvement_pct'] for r in model_b_wins])
    print(f"Average improvement when Model B wins: {avg_improvement:.2f}%")
else:
    print("Average improvement when Model B wins: N/A (no wins)")

## 7. Decision Logic and Recommendations

Based on the dual model comparison, let's establish decision rules for production.

In [None]:
# Calculate drift for Model A (using validation baseline)
# Calculate drift for Model A (using validation baseline).
# Look the drift window up by name instead of by list position, so that
# reordering or extending the windows list cannot silently switch the
# window the drift figure is computed from.
DRIFT_WINDOW = "Last 30 samples"
drift_result = next((r for r in results if r['window'] == DRIFT_WINDOW), None)
if drift_result is None:
    raise ValueError(f"No comparison result found for window {DRIFT_WINDOW!r}")

# Relative change of Model A's RMSE on the drift window vs. its holdout RMSE.
drift_pct = (drift_result['model_a_rmse'] / rmse_val_a - 1) * 100

# Decision logic
model_b_wins = sum(1 for r in results if r['winner'] == 'B')
model_b_win_rate = model_b_wins / len(results) * 100

print("="*80)
print("PRODUCTION DECISION LOGIC")
print("="*80)

print(f"\nModel A (90%) Baseline: {rmse_val_a:.6f} RMSE")
print(f"Model A Drift (last 30): {drift_pct:+.1f}%")
# Denominator comes from the data rather than a hard-coded window count.
print(f"Model B Win Rate: {model_b_win_rate:.0f}% ({model_b_wins}/{len(results)} windows)")

print("\nDecision Rules:")
print("-" * 80)

# Rule 1: Check drift
if drift_pct > 20:
    print("üî¥ CRITICAL DRIFT DETECTED (>20%)")
    print("   ‚Üí RETRAIN BOTH MODELS IMMEDIATELY")
    recommendation = "RETRAIN"
elif drift_pct > 10:
    print("‚ö†Ô∏è  WARNING: Moderate drift detected (10-20%)")
    print("   ‚Üí Plan retraining within next week")
    recommendation = "PLAN_RETRAIN"
else:
    print("‚úì Drift is acceptable (<10%)")
    recommendation = "OK"

# Rule 2: Model selection
# NOTE(review): every comparison window is part of Model B's training data,
# so the win rate used here presumably favours Model B; confirm this is the
# intended basis for the production choice.
print(f"\nModel Selection:")
if model_b_win_rate >= 75 and drift_pct < 10:
    print("‚úì Use Model B (100% data) for production")
    print(f"  Reason: Wins {model_b_win_rate:.0f}% of windows and drift is low")
    selected_model = "B"
elif model_b_win_rate >= 50:
    print("‚ö†Ô∏è  Mixed results - Consider using Model B but monitor closely")
    selected_model = "B (with caution)"
else:
    print("‚úì Use Model A (90% data) for production")
    print("  Reason: More consistent performance, have validation baseline")
    selected_model = "A"

print(f"\n{'='*80}")
print(f"FINAL RECOMMENDATION: Use Model {selected_model}")
print(f"{'='*80}")

## 8. Save Both Models and Baselines

Save both models and their performance metrics for production use.

In [None]:
import pickle
from datetime import datetime


def _bundle(xgb_model, lgbm_model, meta_model):
    """Group one ensemble's three models with the feature column order."""
    return {
        'xgb_model': xgb_model,
        'lgbm_model': lgbm_model,
        'meta_model': meta_model,
        'feature_cols': feature_cols,
    }


def _training_summary(frame):
    """Return the sample count and first/last training date of ``frame``."""
    return {
        'training_samples': len(frame),
        'training_date_start': str(frame['date'].min().date()),
        'training_date_end': str(frame['date'].max().date()),
    }


# Model A (90%): pickle the three fitted estimators together.
model_a_artifact = _bundle(xgb_model_a, lgbm_model_a, meta_model_a)
with open('model_90pct.pkl', 'wb') as f:
    pickle.dump(model_a_artifact, f)
print("‚úì Model A saved: model_90pct.pkl")

# Model B (100%): same layout, separate file.
model_b_artifact = _bundle(xgb_model_b, lgbm_model_b, meta_model_b)
with open('model_100pct.pkl', 'wb') as f:
    pickle.dump(model_b_artifact, f)
print("‚úì Model B saved: model_100pct.pkl")

# Baseline metrics for both models plus the outcome of the comparison.
# Metrics are cast to plain float so json can serialise them.
dual_baseline = {
    'timestamp': datetime.now().isoformat(),
    'model_a': {
        **_training_summary(train_df_90),
        'validation_rmse': float(rmse_val_a),
        'validation_mae': float(mae_val_a),
        'validation_r2': float(r2_val_a),
        'validation_samples': len(val_df),
    },
    'model_b': {
        **_training_summary(df),
        'validation_rmse': float(rmse_val_b),
        'validation_mae': float(mae_val_b),
        'validation_r2': float(r2_val_b),
        'note': 'Model B metrics on validation set are in-sample',
    },
    'comparison': {
        'model_b_win_rate': float(model_b_win_rate),
        'recommended_model': selected_model,
    },
}

with open('dual_model_baseline.json', 'w') as f:
    json.dump(dual_baseline, f, indent=2)
print("‚úì Dual baseline saved: dual_model_baseline.json")

print("\n".join([
    "",
    "=" * 80,
    "ALL ARTIFACTS SAVED",
    "=" * 80,
    "Files created:",
    "  - model_90pct.pkl",
    "  - model_100pct.pkl",
    "  - dual_model_baseline.json",
]))