# Linear Regression Model for Hockey Goal Prediction

This notebook implements the LinearRegressionModel and LinearGoalPredictor classes
for predicting hockey game outcomes using linear regression with regularization.

**Model 3** in our prediction pipeline.

## Features
- ElasticNet regularization (L1/L2 mix)
- Polynomial feature expansion
- Feature scaling (Standard/Robust)
- Coefficient analysis and feature importance

In [None]:
# Imports
import sys
from pathlib import Path

sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## 1. LinearRegressionModel Class

The core class supporting Ridge, Lasso, ElasticNet, and plain OLS regression.

In [None]:
from utils.linear_model import (
    LinearRegressionModel,
    LinearGoalPredictor,
    grid_search_linear,
    random_search_linear,
    compare_regularization
)

# Summarise the main constructor options used throughout this notebook.
# These are hand-written descriptions, not an introspected signature.
print(
    "LinearRegressionModel Parameters:",
    "- alpha: Regularization strength (0 = no regularization)",
    "- l1_ratio: 0.0 = Ridge (L2), 1.0 = Lasso (L1), 0.5 = ElasticNet",
    "- scaling: 'standard', 'robust', or None",
    "- poly_degree: 1 = linear, 2+ = polynomial",
    sep="\n",
)

## 2. Generate Synthetic Data for Demo

In [None]:
# Build a reproducible synthetic set of hockey games.
# The global seed fixes every draw below, so the order of the random calls matters.
np.random.seed(42)
n_games = 500


def _goal_offset(games, side, rival):
    """Return the deterministic part of `side`'s goal total against `rival`.

    Combines the Elo gap, the side's recent form, its own scoring average,
    the rival's scoring average (negative weight) and the rest-day gap.

    Args:
        games: DataFrame holding the `<side>_*` / `<rival>_*` feature columns.
        side: column prefix of the team being scored ('home' or 'away').
        rival: column prefix of the opposing team.

    Returns:
        A Series with one offset per game.
    """
    return (
        0.5 * (games[f'{side}_elo'] - games[f'{rival}_elo']) / 100 +
        0.3 * games[f'{side}_recent_form'] +
        0.5 * games[f'{side}_avg_goals'] -
        0.2 * games[f'{rival}_avg_goals'] +
        0.1 * (games[f'{side}_rest_days'] - games[f'{rival}_rest_days'])
    )


# Features: one column per draw, home before away within each pair.
data = pd.DataFrame({
    'home_elo': np.random.normal(loc=1500, scale=100, size=n_games),
    'away_elo': np.random.normal(loc=1500, scale=100, size=n_games),
    'home_recent_form': np.random.uniform(low=0, high=1, size=n_games),
    'away_recent_form': np.random.uniform(low=0, high=1, size=n_games),
    'home_rest_days': np.random.choice([1, 2, 3, 4, 5], size=n_games),
    'away_rest_days': np.random.choice([1, 2, 3, 4, 5], size=n_games),
    'home_avg_goals': np.random.normal(loc=3.0, scale=0.5, size=n_games),
    'away_avg_goals': np.random.normal(loc=2.8, scale=0.5, size=n_games),
    'home_avg_against': np.random.normal(loc=2.7, scale=0.5, size=n_games),
    'away_avg_against': np.random.normal(loc=2.9, scale=0.5, size=n_games),
})

# Targets depend on the features through the same formula for both sides.
home_base = _goal_offset(data, 'home', 'away')
away_base = _goal_offset(data, 'away', 'home')

# Baseline + offset + unit-variance noise, rounded to whole goals and floored at zero.
# Home noise is drawn before away noise.
data['home_goals'] = np.round(
    2.8 + home_base + np.random.normal(0, 1, n_games)
).clip(lower=0).astype(int)
data['away_goals'] = np.round(
    2.6 + away_base + np.random.normal(0, 1, n_games)
).clip(lower=0).astype(int)

print(f"Generated {len(data)} games\n\nSample data:")
data.head()

In [None]:
# Split data
# Every column except the two goal targets is a predictor.
feature_cols = [col for col in data.columns if col not in ['home_goals', 'away_goals']]
X = data[feature_cols]
y_home = data['home_goals']
y_away = data['away_goals']

# Split the features and BOTH targets in a single call. train_test_split applies
# one shuffle to all arrays it is given, so the rows stay aligned by construction
# instead of relying on two separate calls sharing the same random_state.
X_train, X_test, y_home_train, y_home_test, y_away_train, y_away_test = train_test_split(
    X, y_home, y_away, test_size=0.2, random_state=42
)

# Guard against silent misalignment between features and targets.
assert X_train.index.equals(y_home_train.index) and X_train.index.equals(y_away_train.index)
assert X_test.index.equals(y_home_test.index) and X_test.index.equals(y_away_test.index)
assert len(X_train) + len(X_test) == len(data)

print(f"Training set: {len(X_train)} games")
print(f"Test set: {len(X_test)} games")

## 3. Train Different Regularization Types

In [None]:
# Ridge Regression (L2): l1_ratio=0.0 selects the pure-L2 penalty
ridge_model = LinearRegressionModel(
    alpha=1.0, l1_ratio=0.0, scaling='standard', name='ridge_home'
)
ridge_model.fit(X_train, y_home_train)

# Score home-goal predictions on the held-out games
ridge_metrics = ridge_model.evaluate(X_test, y_home_test)

print(
    "Ridge Regression Results:\n"
    f"  RMSE: {ridge_metrics['rmse']:.4f}\n"
    f"  MAE:  {ridge_metrics['mae']:.4f}\n"
    f"  R²:   {ridge_metrics['r2']:.4f}"
)

In [None]:
# Lasso Regression (L1): l1_ratio=1.0 selects the pure-L1 penalty
lasso_model = LinearRegressionModel(
    alpha=0.1, l1_ratio=1.0, scaling='standard', name='lasso_home'
)
lasso_model.fit(X_train, y_home_train)

# Score home-goal predictions on the held-out games
lasso_metrics = lasso_model.evaluate(X_test, y_home_test)

print(
    "Lasso Regression Results:\n"
    f"  RMSE: {lasso_metrics['rmse']:.4f}\n"
    f"  MAE:  {lasso_metrics['mae']:.4f}\n"
    f"  R²:   {lasso_metrics['r2']:.4f}"
)

# Features whose coefficient survived the L1 penalty (non-zero)
print(f"\nFeatures selected by Lasso: {len(lasso_model.get_nonzero_features())}")
print(lasso_model.get_nonzero_features())

In [None]:
# ElasticNet: l1_ratio=0.5 weights the L1 and L2 penalties equally
elasticnet_model = LinearRegressionModel(
    alpha=0.1, l1_ratio=0.5, scaling='standard', name='elasticnet_home'
)
elasticnet_model.fit(X_train, y_home_train)

# Score home-goal predictions on the held-out games
elasticnet_metrics = elasticnet_model.evaluate(X_test, y_home_test)

print(
    "ElasticNet Regression Results:\n"
    f"  RMSE: {elasticnet_metrics['rmse']:.4f}\n"
    f"  MAE:  {elasticnet_metrics['mae']:.4f}\n"
    f"  R²:   {elasticnet_metrics['r2']:.4f}"
)

## 4. Polynomial Features

In [None]:
# Same ElasticNet settings as before, but with degree-2 polynomial expansion
poly_model = LinearRegressionModel(
    alpha=0.1,
    l1_ratio=0.5,
    poly_degree=2,
    scaling='standard',
    name='poly_elasticnet',
)
poly_model.fit(X_train, y_home_train)

# Score home-goal predictions on the held-out games
poly_metrics = poly_model.evaluate(X_test, y_home_test)

# Report the metrics, then how many columns the expansion produced
print(
    "Polynomial (degree=2) ElasticNet Results:\n"
    f"  RMSE: {poly_metrics['rmse']:.4f}\n"
    f"  MAE:  {poly_metrics['mae']:.4f}\n"
    f"  R²:   {poly_metrics['r2']:.4f}\n"
    f"\n  Original features: {poly_model.n_features_}\n"
    f"  After polynomial expansion: {poly_model.n_features_poly_}"
)

## 5. Coefficient Analysis

In [None]:
# Inspect the coefficients of the ElasticNet model fitted earlier.
# NOTE(review): no preceding cell ranks the models, so "best" is not established;
# elasticnet_model is simply the one chosen for inspection.
# top_n=15 is larger than the 10 input features, so presumably every coefficient
# is returned — confirm the ranking/limit semantics in utils.linear_model.
coefs = elasticnet_model.get_coefficients(top_n=15)
print("Top 15 Most Important Coefficients:")
coefs

In [None]:
# Horizontal bar chart of every coefficient, coloured by sign
fig, ax = plt.subplots(figsize=(10, 6))

coefs_to_plot = elasticnet_model.get_coefficients()
# Strictly positive -> green; zero or negative -> red
colors = ['green' if c > 0 else 'red' for c in coefs_to_plot['coefficient']]

ax.barh(coefs_to_plot['feature'], coefs_to_plot['coefficient'], color=colors)
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)  # zero reference line
ax.set(
    xlabel='Coefficient Value',
    ylabel='Feature',
    title='Linear Regression Coefficients',
)
plt.tight_layout()
plt.show()

## 6. LinearGoalPredictor (Dual Model)

In [None]:
# Split with full data for predictor
# The whole frame (features AND both goal columns) is split here, because the
# predictor is fitted on a DataFrame rather than on separate X / y arrays.
# Same test_size and random_state as the earlier feature/target split.
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Create and train the predictor
# Hyperparameters mirror the single-target ElasticNet model above
# (alpha=0.1, l1_ratio=0.5, standard scaling); poly_degree=1 keeps it linear.
predictor = LinearGoalPredictor(
    alpha=0.1,
    l1_ratio=0.5,
    scaling='standard',
    poly_degree=1
)

# NOTE(review): presumably fit() locates the home_goals / away_goals columns
# itself — confirm against utils.linear_model.
predictor.fit(train_df)
print(predictor)

In [None]:
# Score the dual-target predictor on the held-out games
metrics = predictor.evaluate(test_df)

print("LinearGoalPredictor Evaluation:")

# Per-target regression metrics
print(
    "\nHome Goals Prediction:\n"
    f"  RMSE: {metrics['home']['rmse']:.4f}\n"
    f"  MAE:  {metrics['home']['mae']:.4f}\n"
    f"  R²:   {metrics['home']['r2']:.4f}"
)
print(
    "\nAway Goals Prediction:\n"
    f"  RMSE: {metrics['away']['rmse']:.4f}\n"
    f"  MAE:  {metrics['away']['mae']:.4f}\n"
    f"  R²:   {metrics['away']['r2']:.4f}"
)

# Aggregate error plus the share of games whose winner was called correctly
print(
    "\nCombined:\n"
    f"  RMSE: {metrics['combined']['rmse']:.4f}\n"
    f"  Win Accuracy: {metrics['win_accuracy']:.2%}"
)

In [None]:
# Take the first held-out game and compare its predicted score with the real one
sample_game = test_df.iloc[0]
home_pred, away_pred = predictor.predict_goals(sample_game)

print(
    "Single Game Prediction:\n"
    f"  Predicted: {home_pred:.1f} - {away_pred:.1f}\n"
    f"  Actual:    {sample_game['home_goals']} - {sample_game['away_goals']}"
)

In [None]:
# Predict every held-out game and attach the true scores alongside.
# .values drops the test_df index so the actuals are matched by position.
predictions = predictor.predict_batch(test_df).assign(
    home_actual=test_df['home_goals'].values,
    away_actual=test_df['away_goals'].values,
)
predictions.head(10)

## 7. Compare Regularization Types

In [None]:
# Compare Ridge, Lasso, ElasticNet across alpha values
# Only the training split is passed in, so the held-out test games stay untouched.
# cv=5 is presumably the number of cross-validation folds — confirm in utils.linear_model.
# The plotting cell that follows reads the 'model', 'alpha', 'rmse_mean' and
# 'rmse_std' columns of the returned table.
comparison = compare_regularization(
    X_train, y_home_train,
    alphas=[0.001, 0.01, 0.1, 1.0, 10.0],
    cv=5
)

print("Regularization Comparison:")
comparison

In [None]:
# One RMSE-vs-alpha curve per regularization type, with a ±1 std band
fig, ax = plt.subplots(figsize=(10, 6))

for model_name in ['Ridge', 'Lasso', 'ElasticNet']:
    subset = comparison.loc[comparison['model'] == model_name]
    band_low = subset['rmse_mean'] - subset['rmse_std']
    band_high = subset['rmse_mean'] + subset['rmse_std']
    ax.plot(subset['alpha'], subset['rmse_mean'], marker='o', label=model_name)
    ax.fill_between(subset['alpha'], band_low, band_high, alpha=0.2)

# Alphas span several orders of magnitude, hence the log x-axis
ax.set_xscale('log')
ax.set(
    xlabel='Alpha (Regularization Strength)',
    ylabel='RMSE',
    title='Regularization Comparison',
)
ax.legend()
plt.tight_layout()
plt.show()

## 8. Cross-Validation

In [None]:
# Cross-validate the ElasticNet configuration on the full data set (cv=5)
cv_results = elasticnet_model.cross_validate(X, y_home, cv=5)

# Pre-format the per-fold scores so the report line stays readable
fold_scores = [f'{s:.4f}' for s in cv_results['scores']]

print(
    "5-Fold Cross-Validation Results:\n"
    f"  Mean RMSE: {cv_results['mean']:.4f} ± {cv_results['std']:.4f}\n"
    f"  Fold scores: {fold_scores}"
)

## 9. Save and Load Model

In [None]:
# Save the predictor
# A fresh checkout may not contain ../models/saved, which would make a
# Restart & Run All fail at this cell. Create the directory up front.
# NOTE(review): whether predictor.save creates missing directories itself is not
# visible here; mkdir with exist_ok=True is harmless either way.
Path('../models/saved').mkdir(parents=True, exist_ok=True)
predictor.save('../models/saved/linear_predictor')
print("Predictor saved!")

In [None]:
# Reload the saved predictor and confirm it scores the held-out games
# the same way as the in-memory one
loaded_predictor = LinearGoalPredictor.load('../models/saved/linear_predictor')
loaded_metrics = loaded_predictor.evaluate(test_df)

loaded_rmse = loaded_metrics['combined']['rmse']
original_rmse = metrics['combined']['rmse']

print(f"Loaded predictor RMSE: {loaded_rmse:.4f}")
print(f"Original predictor RMSE: {original_rmse:.4f}")
# Equality is judged to an absolute tolerance of 1e-4
print(f"Match: {abs(loaded_rmse - original_rmse) < 0.0001}")

## Summary

The Linear Regression model provides:
- **Interpretable coefficients** - understand feature contributions
- **Feature selection** - Lasso and ElasticNet can zero out unimportant features
- **Regularization** - prevents overfitting with L1/L2 penalties
- **Polynomial features** - capture non-linear relationships
- **Fast training** - typically much faster to train than tree-based ensembles

### When to Use Linear Regression:
- Baseline comparison with more complex models
- When interpretability is important
- Limited training data (regularized linear models are less prone to overfitting than more flexible models)
- Feature selection with Lasso/ElasticNet