In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import make_scorer, mean_squared_error
import time
import warnings
warnings.filterwarnings('ignore')

# Random seed
np.random.seed(42)

print("="*80)
print("STEP 4: CROSS-VALIDATION (LEAVE-ONE-GROUP-OUT)")
print("="*80)

STEP 4: CROSS-VALIDATION (LEAVE-ONE-GROUP-OUT)


In [2]:
# 4.1 LOADING AND PREPAREING DATA
print("\n4.1 Loading data...")

demand = pd.read_csv('demand_clean.csv')
plants = pd.read_csv('plants_clean.csv')
costs = pd.read_csv('costs_clean.csv')

# Get feature names
demand_features = [col for col in demand.columns 
                   if col.startswith('DF') and col not in ['DF_region', 'DF_daytype']]
plant_features = [col for col in plants.columns if col.startswith('PF')]
all_features = demand_features + plant_features

print(f"Features: {len(all_features)} ({len(demand_features)} demand + {len(plant_features)} plant)")

# Combine datasets
combined = costs.copy()
combined = combined.merge(demand[['Demand ID'] + demand_features], on='Demand ID')
combined = combined.merge(plants[['Plant ID'] + plant_features], on='Plant ID')

print(f"Combined data: {combined.shape}")

# Prepare X, y, and groups
X = combined[all_features].values
y = combined['Cost_USD_per_MWh'].values
groups = combined['Demand ID'].values

print(f"\nX: {X.shape}")
print(f"y: {y.shape}")
print(f"Groups: {len(np.unique(groups))} unique demands")


4.1 Loading data...
Features: 30 (12 demand + 18 plant)
Combined data: (24000, 33)

X: (24000, 30)
y: (24000,)
Groups: 500 unique demands


In [3]:
# 4.2 DEFINE CUSTOM SCORING FUNCTION
print("\n4.2 Setting up custom scoring...")

def custom_rmse(y_true, y_pred):
    """
    Calculate RMSE (Root Mean Squared Error)
    Lower is better, so we return negative for sklearn
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return -rmse  # Negative because sklearn maximizes scores

# Create scorer
scorer = make_scorer(custom_rmse, greater_is_better=True)
print(" Custom RMSE scorer created")


4.2 Setting up custom scoring...
 Custom RMSE scorer created


In [4]:
# 4.3 LEAVE-ONE-GROUP-OUT CROSS-VALIDATION
print("\n4.3 Performing Leave-One-Group-Out Cross-Validation...")

print("\n" + "="*80)
print("WHAT IS LOGO (LEAVE-ONE-GROUP-OUT)?")
print("="*80)
print(f"\nTotal folds: {len(np.unique(groups))} (one for each demand)")
print("="*80)

# Create model
model = RandomForestRegressor(
    n_estimators=50,       # 50 trees (faster than 100)
    max_depth=15,          # Max depth 15 (faster than 20)
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1  # Use all CPU cores
)

# Create LOGO cross-validator
logo = LeaveOneGroupOut()
n_folds = logo.get_n_splits(X, y, groups)

print(f"\n Starting LOGO CV with {n_folds} folds...")
print("This may take 5-10 minutes (training 500 models)")

# Perform cross-validation manually to show progress
start_time = time.time()
cv_scores = []

for fold_num, (train_idx, test_idx) in enumerate(logo.split(X, y, groups), 1):
    # Split data
    X_train_fold = X[train_idx]
    y_train_fold = y[train_idx]
    X_test_fold = X[test_idx]
    y_test_fold = y[test_idx]
    
    # Train model
    model.fit(X_train_fold, y_train_fold)
    
    # Predict
    y_pred_fold = model.predict(X_test_fold)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test_fold, y_pred_fold))
    cv_scores.append(rmse)
    
    # Show progress every 50 folds
    if fold_num % 50 == 0 or fold_num == n_folds:
        elapsed = time.time() - start_time
        print(f"  Fold {fold_num}/{n_folds} complete... ({elapsed:.1f}s elapsed)")

elapsed_time = time.time() - start_time
cv_scores = np.array(cv_scores)

print(f"\n Cross-validation complete in {elapsed_time:.1f} seconds ({elapsed_time/60:.2f} minutes)")



4.3 Performing Leave-One-Group-Out Cross-Validation...

WHAT IS LOGO (LEAVE-ONE-GROUP-OUT)?

Total folds: 500 (one for each demand)

 Starting LOGO CV with 500 folds...
This may take 5-10 minutes (training 500 models)
  Fold 50/500 complete... (96.7s elapsed)
  Fold 100/500 complete... (193.4s elapsed)
  Fold 150/500 complete... (293.3s elapsed)
  Fold 200/500 complete... (404.1s elapsed)
  Fold 250/500 complete... (501.2s elapsed)
  Fold 300/500 complete... (598.0s elapsed)
  Fold 350/500 complete... (695.2s elapsed)
  Fold 400/500 complete... (791.9s elapsed)
  Fold 450/500 complete... (888.9s elapsed)
  Fold 500/500 complete... (989.7s elapsed)

 Cross-validation complete in 989.7 seconds (16.50 minutes)


In [7]:
# 4.4 ANALYZING RESULTS
print("\n4.4 Cross-Validation Results:")

print("\n" + "="*80)
print("CV STATISTICS")
print("="*80)

print(f"Number of folds: {len(cv_scores)}")
print(f"\nRMSE Statistics:")
print(f"  Mean RMSE: ${cv_scores.mean():.4f}")
print(f"  Std RMSE: ${cv_scores.std():.4f}")
print(f"  Min RMSE: ${cv_scores.min():.4f}")
print(f"  Max RMSE: ${cv_scores.max():.4f}")
print(f"  Median RMSE: ${np.median(cv_scores):.4f}")

# Show first 10 fold results
print(f"\nFirst 10 fold RMSE values:")
for i in range(min(10, len(cv_scores))):
    print(f"  Fold {i+1}: ${cv_scores[i]:.4f}")

print(f"\nLast 10 fold RMSE values:")
for i in range(max(0, len(cv_scores)-10), len(cv_scores)):
    print(f"  Fold {i+1}: ${cv_scores[i]:.4f}")


4.4 Cross-Validation Results:

CV STATISTICS
Number of folds: 500

RMSE Statistics:
  Mean RMSE: $7.2823
  Std RMSE: $7.7262
  Min RMSE: $2.7259
  Max RMSE: $61.6236
  Median RMSE: $4.9489

First 10 fold RMSE values:
  Fold 1: $5.9625
  Fold 2: $5.0999
  Fold 3: $3.5977
  Fold 4: $31.2427
  Fold 5: $6.4000
  Fold 6: $5.1345
  Fold 7: $4.5542
  Fold 8: $5.1029
  Fold 9: $6.3459
  Fold 10: $5.9403

Last 10 fold RMSE values:
  Fold 491: $6.8516
  Fold 492: $3.9447
  Fold 493: $4.1272
  Fold 494: $4.9846
  Fold 495: $4.6634
  Fold 496: $3.2592
  Fold 497: $4.9435
  Fold 498: $6.9312
  Fold 499: $4.2645
  Fold 500: $4.9221


In [8]:
# 4.5 COMPAREING WITH PREVIOUS RESULTS
print("\n4.5 Comparison with previous results:")

try:
    # Load baseline
    baseline = pd.read_csv('baseline_performance.csv')
    best_baseline = baseline['RMSE'].min()
    avg_baseline = baseline['RMSE'].mean()
    
    # Load Step 3 results
    step3 = pd.read_csv('step3_results.csv')
    step3_rmse = step3['custom_rmse'].values[0]
    
    print("\n" + "="*80)
    print("PERFORMANCE COMPARISON")
    print("="*80)
    print(f"Baseline (best single plant): ${best_baseline:.2f}")
    print(f"Baseline (average): ${avg_baseline:.2f}")
    print(f"Step 3 (train/test split): ${step3_rmse:.2f}")
    print(f"Step 4 (LOGO CV): ${cv_scores.mean():.2f}")
    
    # Calculate improvement
    improvement_vs_baseline = ((best_baseline - cv_scores.mean()) / best_baseline) * 100
    
    if improvement_vs_baseline > 0:
        print(f"\n Improvement over baseline: {improvement_vs_baseline:.1f}%")
    else:
        print(f"\n Worse than baseline: {-improvement_vs_baseline:.1f}%")
    
    # Store for later
    baseline_best = best_baseline
    baseline_avg = avg_baseline
    step3_rmse_val = step3_rmse
    
except Exception as e:
    print(f"\n Could not load previous results: {e}")
    baseline_best = np.nan
    baseline_avg = np.nan
    step3_rmse_val = np.nan



4.5 Comparison with previous results:

PERFORMANCE COMPARISON
Baseline (best single plant): $8.53
Baseline (average): $22.55
Step 3 (train/test split): $4.84
Step 4 (LOGO CV): $7.28

 Improvement over baseline: 14.6%


In [9]:
# 4.6 INTERPRETATION
print("\n" + "="*80)
print("WHAT DO THESE RESULTS MEAN?")
print("="*80)

print(f"\nCross-Validation RMSE: ${cv_scores.mean():.2f} ± ${cv_scores.std():.2f}")

print("\nInterpretation:")
print(f"  • On average, model makes ${cv_scores.mean():.2f} error per MWh")
print(f"  • Standard deviation of ${cv_scores.std():.2f} shows consistency")
print(f"  • LOGO ensures no data leakage (each demand tested separately)")
print(f"  • Model tested on ALL {len(cv_scores)} demands")

if not np.isnan(baseline_best):
    if cv_scores.mean() < baseline_best:
        print(f"\n Model performs better than best baseline plant!")
        print(f"  Savings: ${best_baseline - cv_scores.mean():.2f} per MWh")
    else:
        print(f"\n Model needs improvement")
        print(f"  Gap: ${cv_scores.mean() - best_baseline:.2f} per MWh")



WHAT DO THESE RESULTS MEAN?

Cross-Validation RMSE: $7.28 ± $7.73

Interpretation:
  • On average, model makes $7.28 error per MWh
  • Standard deviation of $7.73 shows consistency
  • LOGO ensures no data leakage (each demand tested separately)
  • Model tested on ALL 500 demands

 Model performs better than best baseline plant!
  Savings: $1.25 per MWh


In [10]:
# 4.7 SAVE RESULTS
print("\n4.7 Saving results...")

# Save individual fold results
cv_results = pd.DataFrame({
    'Fold': range(1, len(cv_scores) + 1),
    'RMSE': cv_scores
})
cv_results.to_csv('step4_cv_results.csv', index=False)
print(" Saved: step4_cv_results.csv")

# Save summary
summary = {
    'cv_mean_rmse': [cv_scores.mean()],
    'cv_std_rmse': [cv_scores.std()],
    'cv_min_rmse': [cv_scores.min()],
    'cv_max_rmse': [cv_scores.max()],
    'cv_median_rmse': [np.median(cv_scores)],
    'n_folds': [len(cv_scores)],
    'execution_time_seconds': [elapsed_time],
    'cv_method': ['LeaveOneGroupOut']
}

if not np.isnan(baseline_best):
    summary['baseline_best'] = [baseline_best]
    summary['baseline_avg'] = [baseline_avg]
    summary['step3_rmse'] = [step3_rmse_val]

summary_df = pd.DataFrame(summary)
summary_df.to_csv('step4_summary.csv', index=False)
print(" Saved: step4_summary.csv")


4.7 Saving results...
 Saved: step4_cv_results.csv
 Saved: step4_summary.csv


In [11]:
print("\n" + "="*80)
print("STEP 4 SUMMARY")
print("="*80)

print(f"\nCross-Validation Method: Leave-One-Group-Out (LOGO)")
print(f"Number of folds: {len(cv_scores)}")
print(f"Execution time: {elapsed_time:.1f} seconds ({elapsed_time/60:.2f} minutes)")

print(f"\nPerformance:")
print(f"  Mean RMSE: ${cv_scores.mean():.2f}")
print(f"  Std RMSE: ${cv_scores.std():.2f}")

if not np.isnan(baseline_best):
    print(f"\nComparison:")
    print(f"  Best Baseline: ${baseline_best:.2f}")
    print(f"  Our Model: ${cv_scores.mean():.2f}")
    if cv_scores.mean() < baseline_best:
        improvement = ((baseline_best - cv_scores.mean()) / baseline_best) * 100
        print(f"   Improvement: {improvement:.1f}%")

print(f"\nFiles Created:")
print(f"  - step4_cv_results.csv (all {len(cv_scores)} fold results)")
print(f"  - step4_summary.csv (summary statistics)")

print("\n" + "="*80)
print("STEP 4 COMPLETE ")
print("="*80)



STEP 4 SUMMARY

Cross-Validation Method: Leave-One-Group-Out (LOGO)
Number of folds: 500
Execution time: 1090.5 seconds (18.17 minutes)

Performance:
  Mean RMSE: $7.28
  Std RMSE: $7.73

Comparison:
  Best Baseline: $8.53
  Our Model: $7.28
   Improvement: 14.6%

Files Created:
  - step4_cv_results.csv (all 500 fold results)
  - step4_summary.csv (summary statistics)

STEP 4 COMPLETE 
