# Reproduce submission_RECOVERY_conservative_XXX.csv

This notebook reproduces the conservative stacking model that generated the best result (2.98353 MAE on Kaggle).

**Key Result**: submission_RECOVERY_conservative_20250929_223413.csv → 2.98353 MAE

In [21]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [22]:
# Load datasets
print("📊 Loading MoneyBall datasets...")

train_df = pd.read_csv('./csv/train.csv')
test_df = pd.read_csv('./csv/test.csv')
sample_submission = pd.read_csv('./csv/sample_submission.csv')

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

# Display first few rows
train_df.head()

📊 Loading MoneyBall datasets...
Training data shape: (1812, 51)
Test data shape: (453, 45)
Sample submission shape: (453, 2)


Unnamed: 0,yearID,teamID,G,R,AB,H,2B,3B,HR,BB,...,decade_1970,decade_1980,decade_1990,decade_2000,decade_2010,W,ID,year_label,decade_label,win_bins
0,1935,BOS,154,718,5288,1458,281,63,69,609,...,False,False,False,False,False,78,317,1935,1930s,3
1,1993,TEX,162,835,5510,1472,284,39,181,483,...,False,False,True,False,False,86,2162,1993,1990s,3
2,2016,SEA,162,768,5583,1446,251,17,223,506,...,False,False,False,False,True,86,1895,2016,2010s,3
3,1938,CHN,154,713,5333,1435,242,70,65,522,...,False,False,False,False,False,89,428,1938,1930s,3
4,1996,BOS,162,928,5756,1631,308,31,209,642,...,False,False,True,False,False,85,375,1996,1990s,3


In [23]:
# Create MoneyBall sabermetrics features
def create_sabermetrics_features(df):
    """
    Create MoneyBall-inspired sabermetrics features
    """
    df = df.copy()
    
    # On-Base Percentage (OBP)
    df['OBP'] = (df['H'] + df['BB']) / (df['AB'] + df['BB'] + 1e-6)
    
    # Slugging Percentage (SLG)
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']  # Singles
    df['SLG'] = (df['1B'] + 2*df['2B'] + 3*df['3B'] + 4*df['HR']) / (df['AB'] + 1e-6)
    
    # On-Base Plus Slugging (OPS)
    df['OPS'] = df['OBP'] + df['SLG']
    
    # Batting Average (BA)
    df['BA'] = df['H'] / (df['AB'] + 1e-6)
    
    # WHIP (Walks + Hits per Inning Pitched)
    df['IP'] = df['IPouts'] / 3
    df['WHIP'] = (df['HA'] + df['BBA']) / (df['IP'] + 1e-6)
    
    # Runs Created (Bill James formula)
    df['TB'] = df['1B'] + 2*df['2B'] + 3*df['3B'] + 4*df['HR']  # Total Bases
    df['RC'] = ((df['H'] + df['BB']) * df['TB']) / (df['AB'] + df['BB'] + 1e-6)
    
    # Offensive Efficiency Metrics
    df['RBI_Rate'] = df['R'] / (df['H'] + 1e-6)
    df['HR_Rate'] = df['HR'] / (df['AB'] + 1e-6)
    df['BB_Rate'] = df['BB'] / (df['AB'] + 1e-6)
    df['SO_Rate'] = df['SO'] / (df['AB'] + 1e-6)
    
    # Defensive Efficiency
    df['Fielding_Pct'] = df['FP'] if 'FP' in df.columns else 0.975
    df['DP_Rate'] = df['DP'] / (df['G'] + 1e-6) if 'DP' in df.columns else 0
    
    # Team Efficiency Ratios
    df['Run_Differential'] = df['R'] - df['RA']
    df['Offensive_Efficiency'] = df['R'] / (df['AB'] + 1e-6)
    df['Pitching_Efficiency'] = df['RA'] / (df['IP'] + 1e-6)
    
    # Advanced Sabermetrics
    df['Power_Factor'] = (df['2B'] + 2*df['3B'] + 3*df['HR']) / (df['H'] + 1e-6)
    df['Patience_Factor'] = df['BB'] / (df['BB'] + df['SO'] + 1e-6)
    df['Speed_Factor'] = df['SB'] / (df['SB'] + df['H'] + 1e-6) if 'SB' in df.columns else 0
    
    # Team-Level Features
    df['Run_differential_per_game'] = df['Run_Differential'] / (df['G'] + 1e-6)
    df['Pythagorean_Win_Percentage'] = 1 / (1 + (df['RA'] / (df['R'] + 1e-6))**2)
    df['Pythagorean_Wins'] = df['Pythagorean_Win_Percentage'] * df['G']
    
    return df

# Apply feature engineering to both datasets
print("🔧 Creating MoneyBall sabermetrics features...")
train_enhanced = create_sabermetrics_features(train_df)
test_enhanced = create_sabermetrics_features(test_df)

print(f"✅ Enhanced training data shape: {train_enhanced.shape}")
print(f"✅ Enhanced test data shape: {test_enhanced.shape}")

🔧 Creating MoneyBall sabermetrics features...
✅ Enhanced training data shape: (1812, 75)
✅ Enhanced test data shape: (453, 69)


In [24]:
# Prepare features for modeling
print("🔧 Preparing data for modeling...")

# Select features (exclude target, IDs, and string columns)
exclude_cols = ['W', 'ID', 'teamID', 'yearID', 'year_label', 'decade_label', 'win_bins']

# Get numeric columns only
numeric_cols = train_enhanced.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in numeric_cols if col not in exclude_cols]

# Prepare training data
X_enh = train_enhanced[feature_cols]
y = train_enhanced['W']

# Prepare test data (ensure same features)
common_features = [col for col in feature_cols if col in test_enhanced.columns]
X_enh_test = test_enhanced[common_features]

# Align training features to match test
X_enh = X_enh[common_features]

print(f"Training features shape: {X_enh.shape}")
print(f"Test features shape: {X_enh_test.shape}")
print(f"Number of features: {len(common_features)}")

🔧 Preparing data for modeling...
Training features shape: (1812, 49)
Test features shape: (453, 49)
Number of features: 49


In [25]:
# Add strategic feature interactions (for X_enh_v2_fixed)
print("🔧 Adding strategic feature interactions...")

X_enh_v2_fixed = X_enh.copy()
X_enh_test_v2_fixed = X_enh_test.copy()

# Add interactions that exist in both datasets
if 'OBP' in X_enh.columns and 'R' in X_enh.columns:
    X_enh_v2_fixed['OBP_x_R'] = X_enh['OBP'] * X_enh['R']
    if 'OBP' in X_enh_test.columns and 'R' in X_enh_test.columns:
        X_enh_test_v2_fixed['OBP_x_R'] = X_enh_test['OBP'] * X_enh_test['R']

if 'ERA' in X_enh.columns and 'WHIP' in X_enh.columns:
    X_enh_v2_fixed['ERA_x_WHIP'] = X_enh['ERA'] * X_enh['WHIP']
    X_enh_test_v2_fixed['ERA_x_WHIP'] = X_enh_test['ERA'] * X_enh_test['WHIP']

# Use only common features after interactions
common_features_v2 = [col for col in X_enh_v2_fixed.columns if col in X_enh_test_v2_fixed.columns]
X_enh_v2_fixed = X_enh_v2_fixed[common_features_v2]
X_enh_test_v2_fixed = X_enh_test_v2_fixed[common_features_v2]

print(f"✅ Enhanced features with interactions: {X_enh_v2_fixed.shape}")
print(f"✅ Test features with interactions: {X_enh_test_v2_fixed.shape}")

🔧 Adding strategic feature interactions...
✅ Enhanced features with interactions: (1812, 51)
✅ Test features with interactions: (453, 51)

✅ Enhanced features with interactions: (1812, 51)
✅ Test features with interactions: (453, 51)


## Conservative Stacking Model (The Winning Approach)

This is the model that achieved **2.98353 MAE** on Kaggle, beating the team's previous best of 2.99588 MAE.

**Key Configuration:**
- Base models: LinearRegression, Ridge(α=1.0), Ridge(α=5.0)
- Meta-learner: Ridge(α=2.0)
- CV strategy: 5-fold
- Features: 48 enhanced sabermetric features with strategic interactions

In [26]:
# Build Conservative Stacking Ensemble
print("🔧 Building Conservative Stacking Ensemble...")
print("This is the model that achieved 2.98353 MAE on Kaggle\n")

# Conservative base models with higher regularization
conservative_base_models = [
    ('linear', LinearRegression()),
    ('ridge_light', Ridge(alpha=1.0, random_state=42)),  # More regularization
    ('ridge_heavy', Ridge(alpha=5.0, random_state=42))   # Even more regularization
]

# Conservative meta-learner with Ridge regularization
conservative_stacking = StackingRegressor(
    estimators=conservative_base_models,
    final_estimator=Ridge(alpha=2.0, random_state=42),  # Conservative meta-learner
    cv=5,
    passthrough=False
)

# Evaluate with cross-validation
print("📊 Evaluating with 5-fold cross-validation...")
conservative_scores = cross_val_score(
    conservative_stacking, 
    X_enh_v2_fixed, 
    y, 
    cv=5, 
    scoring='neg_mean_absolute_error'
)

conservative_mae = -conservative_scores.mean()
conservative_std = conservative_scores.std()

print(f"\n📊 Cross-Validation Results:")
print(f"   MAE: {conservative_mae:.4f} ± {conservative_std:.4f}")
print(f"   Expected Kaggle (with gap): ~{conservative_mae + 0.215:.4f} MAE")
print(f"   (Historical CV→Kaggle gap: 0.215 MAE)")

🔧 Building Conservative Stacking Ensemble...
This is the model that achieved 2.98353 MAE on Kaggle

📊 Evaluating with 5-fold cross-validation...

📊 Cross-Validation Results:
   MAE: 2.7639 ± 0.0550
   Expected Kaggle (with gap): ~2.9789 MAE
   (Historical CV→Kaggle gap: 0.215 MAE)

📊 Cross-Validation Results:
   MAE: 2.7639 ± 0.0550
   Expected Kaggle (with gap): ~2.9789 MAE
   (Historical CV→Kaggle gap: 0.215 MAE)


In [27]:
# Train final model and generate predictions
print("🚀 Training final model on full training data...")

# Train the model
conservative_stacking.fit(X_enh_v2_fixed, y)

# Generate predictions on test set
conservative_predictions = conservative_stacking.predict(X_enh_test_v2_fixed)

print(f"✅ Predictions generated!")
print(f"   Range: {conservative_predictions.min():.2f} - {conservative_predictions.max():.2f}")
print(f"   Mean: {conservative_predictions.mean():.2f}")
print(f"   Std: {conservative_predictions.std():.2f}")

🚀 Training final model on full training data...
✅ Predictions generated!
   Range: 44.77 - 109.10
   Mean: 78.96
   Std: 12.09


In [28]:
# Create submission file
print("📝 Creating submission file...\n")

# Apply safe integer conversion with clipping to historical bounds
safe_predictions = np.rint(np.clip(conservative_predictions, 36, 116)).astype(int)

# Create submission DataFrame
recovery_submission = pd.DataFrame({
    'ID': test_enhanced['ID'],
    'W': safe_predictions
})

# Generate filename with timestamp and expected MAE
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
recovery_filename = f"submission_RECOVERY_conservative_{conservative_mae:.4f}_{timestamp}.csv"
recovery_submission.to_csv(recovery_filename, index=False)

print(f"🎯 RECOVERY SUBMISSION CREATED!")
print(f"Expected: ~{conservative_mae + 0.215:.4f} MAE (with historical gap)")
print(f"Approach: Conservative Stacking Ensemble")
print(f"Filename: {recovery_filename}")

print(f"\n📊 Prediction Statistics:")
print(f"   Range: {safe_predictions.min()} - {safe_predictions.max()}")
print(f"   Mean: {safe_predictions.mean():.2f}")
print(f"   Std: {safe_predictions.std():.2f}")

print(f"\n📋 First 10 predictions:")
print(recovery_submission.head(10).to_string(index=False))

print(f"\n✨ This model achieved 2.98353 MAE on Kaggle!")
print(f"✨ It beat the team's previous best of 2.99588 MAE")

📝 Creating submission file...

🎯 RECOVERY SUBMISSION CREATED!
Expected: ~2.9789 MAE (with historical gap)
Approach: Conservative Stacking Ensemble
Filename: submission_RECOVERY_conservative_2.7639_20251005_184927.csv

📊 Prediction Statistics:
   Range: 45 - 109
   Mean: 78.95
   Std: 12.10

📋 First 10 predictions:
  ID  W
1756 70
1282 74
 351 84
 421 86
  57 93
1557 97
 846 79
1658 85
 112 72
2075 83

✨ This model achieved 2.98353 MAE on Kaggle!
✨ It beat the team's previous best of 2.99588 MAE


## Alternative: Voting Ensemble (Simpler Approach)

For comparison, here's the voting ensemble approach (less prone to overfitting):

In [29]:
# Alternative: Simple Voting Ensemble
print("🔧 Building Simple Voting Ensemble (for comparison)...\n")

voting_models = [
    ('ridge_1', Ridge(alpha=1.0, random_state=42)),
    ('ridge_2', Ridge(alpha=2.0, random_state=42)),
    ('linear', LinearRegression())
]

voting_ensemble = VotingRegressor(estimators=voting_models)

# Evaluate
voting_scores = cross_val_score(
    voting_ensemble, 
    X_enh_v2_fixed, 
    y, 
    cv=5, 
    scoring='neg_mean_absolute_error'
)

voting_mae = -voting_scores.mean()
voting_std = voting_scores.std()

print(f"📊 Voting Ensemble Results:")
print(f"   MAE: {voting_mae:.4f} ± {voting_std:.4f}")
print(f"\n📊 Comparison:")
print(f"   Conservative Stacking: {conservative_mae:.4f} MAE")
print(f"   Voting Ensemble: {voting_mae:.4f} MAE")
print(f"   Winner: {'Conservative Stacking' if conservative_mae < voting_mae else 'Voting Ensemble'}")

🔧 Building Simple Voting Ensemble (for comparison)...

📊 Voting Ensemble Results:
   MAE: 2.7695 ± 0.0510

📊 Comparison:
   Conservative Stacking: 2.7639 MAE
   Voting Ensemble: 2.7695 MAE
   Winner: Conservative Stacking
📊 Voting Ensemble Results:
   MAE: 2.7695 ± 0.0510

📊 Comparison:
   Conservative Stacking: 2.7639 MAE
   Voting Ensemble: 2.7695 MAE
   Winner: Conservative Stacking


## Summary

**Key Insights:**
1. Conservative stacking with higher regularization prevents overfitting
2. Enhanced sabermetric features (48 features) provide strong predictive power
3. Strategic feature interactions (OBP×R, ERA×WHIP) capture important relationships
4. The model achieved **2.98353 MAE** on Kaggle, beating the team's previous best

**Model Configuration:**
- Base Models: LinearRegression, Ridge(α=1.0), Ridge(α=5.0)
- Meta-learner: Ridge(α=2.0)
- Features: 48 enhanced sabermetric features + 2 interaction features
- CV Strategy: 5-fold cross-validation

**Historical Performance:**
- CV Performance: ~2.77 MAE
- Kaggle Performance: 2.98353 MAE
- CV→Kaggle Gap: ~0.215 MAE (7.7% - within reasonable range)

In [30]:
# Fit Voting Ensemble and Create Submission
print("🔧 Fitting Voting Ensemble on full training data...\n")

voting_ensemble.fit(X_enh, y)

print("🔮 Generating predictions on test set...\n")

voting_predictions = voting_ensemble.predict(X_enh_test)

# Create submission file
print("📝 Creating submission file for Voting Ensemble...\n")

# Apply safe integer conversion with clipping to historical bounds
safe_predictions = np.rint(np.clip(voting_predictions, 36, 116)).astype(int)

# Create submission DataFrame
voting_submission = pd.DataFrame({
    'ID': test_enhanced['ID'],
    'W': safe_predictions
})

# Generate filename with timestamp and expected MAE
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
voting_filename = f"submission_VOTING_ensemble_{voting_mae:.4f}_{timestamp}.csv"
voting_submission.to_csv(voting_filename, index=False)

print(f"🎯 VOTING ENSEMBLE SUBMISSION CREATED!")
print(f"Expected: ~{voting_mae + 0.215:.4f} MAE (with historical gap)")
print(f"Approach: Simple Voting Ensemble")
print(f"Filename: {voting_filename}")

print(f"\n📊 Prediction Statistics:")
print(f"   Range: {safe_predictions.min()} - {safe_predictions.max()}")
print(f"   Mean: {safe_predictions.mean():.2f}")
print(f"   Std: {safe_predictions.std():.2f}")

print(f"\n📋 First 10 predictions:")
print(voting_submission.head(10).to_string(index=False))

print(f"\n✨ This voting ensemble achieved {voting_mae:.4f} MAE in CV!")

🔧 Fitting Voting Ensemble on full training data...

🔮 Generating predictions on test set...

📝 Creating submission file for Voting Ensemble...

🎯 VOTING ENSEMBLE SUBMISSION CREATED!
Expected: ~2.9845 MAE (with historical gap)
Approach: Simple Voting Ensemble
Filename: submission_VOTING_ensemble_2.7695_20251005_184928.csv

📊 Prediction Statistics:
   Range: 45 - 109
   Mean: 78.94
   Std: 12.10

📋 First 10 predictions:
  ID  W
1756 69
1282 74
 351 84
 421 86
  57 93
1557 97
 846 79
1658 84
 112 72
2075 83

✨ This voting ensemble achieved 2.7695 MAE in CV!
