# Polynomial Regression and Overfitting

This notebook explores polynomial regression and the bias-variance tradeoff. We'll visualize how model complexity affects overfitting and underfitting, and learn to identify the optimal model complexity.

Topics covered:
1. Creating synthetic non-linear data
2. Implementing and fitting polynomial regression models
3. Visualizing underfitting, good fit, and overfitting
4. Analyzing the bias-variance tradeoff
5. Finding the optimal polynomial degree

In [43]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
# NOTE: the name train_test_split imported here is rebound by the
# datasets.data_utils import further down, so the scikit-learn version is
# not the one the later cells end up calling.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Add the parent directory to sys.path to import our custom modules
# ('..' is resolved against the current working directory, so this presumably
# expects the notebook to be run from a subdirectory of the project root —
# TODO confirm)
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import our implementations
from models.polynomial_regression import PolynomialRegression
from utils.plotting import plot_regression_results, plot_overfitting_curve
# This import wins over the scikit-learn train_test_split imported above.
from datasets.data_utils import generate_synthetic_data, train_test_split

# Set random seed for reproducibility
# (seeds NumPy's global generator, which generate_nonlinear_data draws from)
np.random.seed(42)

## 1. Generating Non-Linear Data

Let's create synthetic data with a non-linear relationship to demonstrate polynomial regression.

In [None]:
def generate_nonlinear_data(n_samples=100, function_type='polynomial', noise=0.5, x_range=(-3, 3)):
    """Build a 1-D synthetic regression dataset with additive Gaussian noise.

    Args:
        n_samples: Number of evenly spaced x values to generate.
        function_type: Shape of the noise-free target curve:
            'polynomial'  -> 0.5*x^3 - 2*x^2 + 1.5*x + 2
            'sine'        -> sin(2*x) + 0.5*x
            'exponential' -> exp(0.5*x), clipped to the interval [0, 100]
        noise: Scale factor applied to standard-normal noise added to y.
        x_range: Sequence whose first two items are the lower and upper
            ends of the x interval (both endpoints are included).

    Returns:
        Tuple ``(X, y)`` of 1-D arrays, each of length ``n_samples``.

    Raises:
        ValueError: If ``function_type`` is not one of the names above.
    """
    # Noise-free curve for every supported function type
    curves = {
        'polynomial': lambda x: 0.5 * x**3 - 2 * x**2 + 1.5 * x + 2,
        'sine': lambda x: np.sin(2 * x) + 0.5 * x,
        'exponential': lambda x: np.clip(np.exp(0.5 * x), 0, 100),
    }

    X = np.linspace(x_range[0], x_range[1], n_samples)

    if function_type not in curves:
        raise ValueError(f"Unknown function type: {function_type}")

    y = curves[function_type](X)

    # Noise comes from NumPy's global generator, so np.random.seed controls it
    y += np.random.randn(n_samples) * noise

    return X, y

# Draw one noisy sample set per target shape. The polynomial set is generated
# first so the global random stream is consumed in a fixed order.
X_poly, y_poly = generate_nonlinear_data(function_type='polynomial', n_samples=100, noise=1.0)
X_sine, y_sine = generate_nonlinear_data(function_type='sine', n_samples=100, noise=0.3)

# Show both datasets side by side, one scatter plot per panel
plt.figure(figsize=(15, 6))

panels = [
    (X_poly, y_poly, 'Polynomial Function Data'),
    (X_sine, y_sine, 'Sine Function Data'),
]
for position, (xs, ys, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(xs, ys, alpha=0.7)
    plt.title(heading)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.grid(True)

plt.tight_layout()
plt.show()

## 2. Polynomial Regression with Different Degrees

Let's fit polynomial regression models of different degrees to see how the fit changes.

In [None]:
# Sanity-check the inputs before training: report whether either array holds NaNs
import numpy as np

print(f"NaNs in X_poly: {np.isnan(X_poly).any()}")
print(f"NaNs in y_poly: {np.isnan(y_poly).any()}")

# Replace any non-finite entries with finite numbers (NaN becomes 0.0)
X_poly = np.nan_to_num(X_poly)
y_poly = np.nan_to_num(y_poly)

# Hold out 30% of the polynomial samples for validation
X_train_poly, X_val_poly, y_train_poly, y_val_poly = train_test_split(
    X_poly, y_poly, test_size=0.3, random_state=42
)

# The model is fed 2-D input, so turn each X into a single-column array
X_train_poly = X_train_poly.reshape(-1, 1)
X_val_poly = X_val_poly.reshape(-1, 1)

# One model per degree; results are collected in parallel lists
degrees = [1, 2, 3]
models_poly = []
train_mse_poly = []
val_mse_poly = []

# Evenly spaced grid over the data range, shared by every fitted curve
X_plot = np.linspace(X_poly.min(), X_poly.max(), 100).reshape(-1, 1)

plt.figure(figsize=(15, 12))

for panel, degree in enumerate(degrees, start=1):
    # Fit a model of this degree and keep it
    model = PolynomialRegression(degree=degree, learning_rate=0.01, max_iterations=5000)
    model.fit(X_train_poly, y_train_poly)
    models_poly.append(model)

    fit_train = model.predict(X_train_poly)
    fit_val = model.predict(X_val_poly)

    # Report NaN predictions and swap in finite values so the metrics and
    # the plot can still be produced
    has_nan = np.isnan(fit_train).any() or np.isnan(fit_val).any()
    if has_nan:
        print(f"NaN values detected in predictions for degree {degree}")
        fit_train = np.nan_to_num(fit_train)
        fit_val = np.nan_to_num(fit_val)

    # Error on the data the model saw versus the held-out data
    train_mse = mean_squared_error(y_train_poly, fit_train)
    val_mse = mean_squared_error(y_val_poly, fit_val)
    train_mse_poly.append(train_mse)
    val_mse_poly.append(val_mse)

    # One panel per degree: both data splits plus the fitted curve
    plt.subplot(2, 2, panel)
    plt.scatter(X_train_poly, y_train_poly, color='blue', alpha=0.6, label='Training data')
    plt.scatter(X_val_poly, y_val_poly, color='green', alpha=0.6, label='Validation data')

    curve = model.predict(X_plot)
    if np.isnan(curve).any():
        curve = np.nan_to_num(curve)
    plt.plot(X_plot, curve, 'r-', linewidth=2, label=f'Degree {degree} model')

    plt.title(f'Polynomial Regression (Degree {degree})\nTrain MSE: {train_mse:.2f}, Val MSE: {val_mse:.2f}')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

## 3. Visualizing the Bias-Variance Tradeoff

Let's plot the training and validation errors as a function of model complexity to visualize the bias-variance tradeoff.

In [None]:
# Sweep polynomial degrees and record the train/validation error of each fit.
# NOTE(review): range(1, 3) covers only degrees 1 and 2, so the sweep never
# reaches degree 3 even though the data comes from a cubic. Widen the range
# once higher degrees are confirmed to train without diverging.
degrees_extended = list(range(1, 3))
train_mse_extended = []
val_mse_extended = []

for degree in degrees_extended:
    model = PolynomialRegression(degree=degree, learning_rate=0.01, max_iterations=5000)
    model.fit(X_train_poly, y_train_poly)

    y_train_pred = model.predict(X_train_poly)
    y_val_pred = model.predict(X_val_poly)

    train_mse = mean_squared_error(y_train_poly, y_train_pred)
    val_mse = mean_squared_error(y_val_poly, y_val_pred)

    train_mse_extended.append(train_mse)
    val_mse_extended.append(val_mse)

# Identify the optimal degree (lowest validation error). Keep the list
# position as well: the results are looked up by position, which stays correct
# even if the sweep does not start at degree 1 or uses a step other than 1.
best_idx = int(np.argmin(val_mse_extended))
optimal_degree = degrees_extended[best_idx]
min_val_mse = val_mse_extended[best_idx]

# Upper y limit with 20% headroom over the largest error of either curve, so
# neither the training nor the validation line is cut off
y_upper = max(max(train_mse_extended), max(val_mse_extended)) * 1.2

# Plot the error curves
plt.figure(figsize=(12, 6))

plt.plot(degrees_extended, train_mse_extended, 'bo-', linewidth=2, label='Training MSE')
plt.plot(degrees_extended, val_mse_extended, 'ro-', linewidth=2, label='Validation MSE')

# Highlight the optimal degree
plt.axvline(x=optimal_degree, color='green', linestyle='--', label=f'Optimal degree: {optimal_degree}')
plt.scatter([optimal_degree], [min_val_mse], s=100, color='green')

# Annotate regions. The anchors are taken from the swept range because
# matplotlib does not draw an annotation whose data-coordinate anchor lies
# outside the axes; xytext is omitted since it defaults to xy.
plt.annotate('Underfitting\n(High Bias)',
             xy=(degrees_extended[0], max(train_mse_extended[:5]) * 0.8),
             fontsize=12, ha='left')

# Clamp the height so the label stays below the top of the plot
plt.annotate('Overfitting\n(High Variance)',
             xy=(degrees_extended[-1], min(min_val_mse * 2, y_upper * 0.9)),
             fontsize=12, ha='right')

plt.annotate('Optimal\nModel Complexity', xy=(optimal_degree, min_val_mse * 0.5),
             fontsize=12, ha='center')

plt.title('Bias-Variance Tradeoff for Polynomial Regression')
plt.xlabel('Polynomial Degree (Model Complexity)')
plt.ylabel('Mean Squared Error')
plt.xticks(degrees_extended)
plt.ylim(0, y_upper)
plt.legend()
plt.grid(True)
plt.show()

print(f"Optimal polynomial degree: {optimal_degree}")
print(f"Training MSE at optimal degree: {train_mse_extended[best_idx]:.2f}")
print(f"Validation MSE at optimal degree: {min_val_mse:.2f}")

## 4. Training Curve Analysis

Let's look at how the training progresses for different model complexities.

In [None]:
# Train models and track the learning curves.
# Compare the linear model, the degree selected on validation error, and the
# cubic model. dict.fromkeys drops a repeated degree (optimal_degree can itself
# be 1 or 3) while keeping the order, so no model is trained or plotted twice.
degrees_for_curves = list(dict.fromkeys([1, optimal_degree, 3]))
models_curves = []

for degree in degrees_for_curves:
    # store_history=True so the per-iteration cost is available for plotting
    model = PolynomialRegression(degree=degree, learning_rate=0.01, max_iterations=5000, store_history=True)
    model.fit(X_train_poly, y_train_poly)
    models_curves.append(model)

# Plot the learning curves
plt.figure(figsize=(12, 6))

for model, degree in zip(models_curves, degrees_for_curves):
    # The selected degree is always labelled as the optimal fit, even when it
    # coincides with the linear or the cubic model
    if degree == optimal_degree:
        label = 'Optimal Fit'
    elif degree == 1:
        label = 'Underfitting (Linear)'
    else:
        label = 'Overfitting (High Degree)'
    plt.plot(model.cost_history, label=f'{label} (Degree {degree})')

plt.title('Learning Curves for Different Model Complexities')
plt.xlabel('Iterations')
plt.ylabel('Cost (MSE)')
plt.legend()
plt.grid(True)
plt.ylim(0, 50)  # Adjust as needed
plt.show()

## 5. Learning Curves with Training Set Size

Now let's examine how model performance changes with training set size for different polynomial degrees.

In [None]:
# Generate more data for this experiment
X_more, y_more = generate_nonlinear_data(n_samples=300, function_type='polynomial', noise=1.0)
X_more = X_more.reshape(-1, 1)  # Reshape for our model

# Split into training and validation
X_train_more, X_val_more, y_train_more, y_val_more = train_test_split(X_more, y_more, test_size=0.3, random_state=42)

# Try different training set sizes
train_sizes = np.linspace(0.1, 1.0, 10)  # From 10% to 100% of training data

# Linear, optimal, high-degree. The degrees are de-duplicated (order kept)
# because the result dicts below are keyed by degree: a repeated degree would
# receive two appends per training size and its curve would no longer line up
# with train_sizes when plotted.
degrees_learning = list(dict.fromkeys([1, optimal_degree, 3]))

# Store results: degree -> one error value per training size
train_mse_by_size = {degree: [] for degree in degrees_learning}
val_mse_by_size = {degree: [] for degree in degrees_learning}

for size in train_sizes:
    # Create a subset of the training data (the leading fraction of the split)
    n_samples = int(len(X_train_more) * size)
    X_train_subset = X_train_more[:n_samples]
    y_train_subset = y_train_more[:n_samples]

    for degree in degrees_learning:
        # Train model on subset
        model = PolynomialRegression(degree=degree, learning_rate=0.01, max_iterations=5000)
        model.fit(X_train_subset, y_train_subset)

        # Evaluate on the training subset and on the full validation set
        train_pred = model.predict(X_train_subset)
        val_pred = model.predict(X_val_more)

        train_mse = mean_squared_error(y_train_subset, train_pred)
        val_mse = mean_squared_error(y_val_more, val_pred)

        train_mse_by_size[degree].append(train_mse)
        val_mse_by_size[degree].append(val_mse)

# Plot learning curves, one panel per distinct degree
plt.figure(figsize=(18, 6))

for panel, degree in enumerate(degrees_learning, start=1):
    plt.subplot(1, len(degrees_learning), panel)

    plt.plot(train_sizes * 100, train_mse_by_size[degree], 'b-o', label='Training error')
    plt.plot(train_sizes * 100, val_mse_by_size[degree], 'r-o', label='Validation error')

    plt.title(f'Learning Curves for Degree {degree}')
    plt.xlabel('Training Set Size (%)')
    plt.ylabel('Mean Squared Error')
    plt.legend()
    plt.grid(True)
    plt.ylim(0, 50)  # Adjust as needed

plt.tight_layout()
plt.show()

## 6. Fitted Curves and Model Predictions

Let's plot the fitted curve of each model against the training and validation data, extending slightly beyond the range of the data, to compare how models of different complexity behave.

In [None]:
# Three models of increasing complexity; descriptions are matched by position
degrees_final = [1, optimal_degree, 3]  # Linear, optimal, high-degree
descriptions = ['Underfit (Linear)', 'Optimal Fit', 'Overfit (High Degree)']
models_final = []

for deg in degrees_final:
    fitted = PolynomialRegression(degree=deg, learning_rate=0.01, max_iterations=5000)
    fitted.fit(X_train_poly, y_train_poly)
    models_final.append(fitted)

# Dense grid reaching 0.5 beyond the data on both sides, so each curve's
# behaviour just outside the sampled range is visible too
grid_low = X_poly.min() - 0.5
grid_high = X_poly.max() + 0.5
X_smooth = np.linspace(grid_low, grid_high, 1000).reshape(-1, 1)

plt.figure(figsize=(15, 6))

for panel, desc in enumerate(descriptions, start=1):
    fitted = models_final[panel - 1]
    deg = degrees_final[panel - 1]

    plt.subplot(1, 3, panel)

    # Both data splits as scatter points
    plt.scatter(X_train_poly, y_train_poly, color='blue', alpha=0.5, label='Training data')
    plt.scatter(X_val_poly, y_val_poly, color='green', alpha=0.5, label='Validation data')

    # Fitted curve over the dense grid
    y_smooth = fitted.predict(X_smooth)
    plt.plot(X_smooth, y_smooth, 'r-', linewidth=2, label=f'Degree {deg} model')

    # Errors shown in the panel title
    train_mse = mean_squared_error(y_train_poly, fitted.predict(X_train_poly))
    val_mse = mean_squared_error(y_val_poly, fitted.predict(X_val_poly))

    plt.title(f'{desc} (Degree {deg})\nTrain MSE: {train_mse:.2f}, Val MSE: {val_mse:.2f}')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

## 7. Different Non-Linear Function: Sine Wave

Let's repeat our analysis with a sine wave function to see how polynomial regression handles different types of non-linearity.

In [None]:
# Reshape the sine wave data into a single column (2-D input for the model)
X_sine = X_sine.reshape(-1, 1)

# Split the sine data into training and validation
X_train_sine, X_val_sine, y_train_sine, y_val_sine = train_test_split(X_sine, y_sine, test_size=0.3, random_state=42)

# Train models with different polynomial degrees. The fitted models are kept
# so the best one can be plotted later without training it a second time.
degrees_sine = [1, 2, 3]
models_sine = []
train_mse_sine = []
val_mse_sine = []

for degree in degrees_sine:
    model = PolynomialRegression(degree=degree, learning_rate=0.01, max_iterations=5000)
    model.fit(X_train_sine, y_train_sine)
    models_sine.append(model)

    train_pred = model.predict(X_train_sine)
    val_pred = model.predict(X_val_sine)

    train_mse = mean_squared_error(y_train_sine, train_pred)
    val_mse = mean_squared_error(y_val_sine, val_pred)

    train_mse_sine.append(train_mse)
    val_mse_sine.append(val_mse)

# Plot MSE vs. polynomial degree
plt.figure(figsize=(12, 6))

plt.plot(degrees_sine, train_mse_sine, 'bo-', linewidth=2, label='Training MSE')
plt.plot(degrees_sine, val_mse_sine, 'ro-', linewidth=2, label='Validation MSE')

# Find the optimal degree (lowest validation error)
best_idx_sine = int(np.argmin(val_mse_sine))
optimal_degree_sine = degrees_sine[best_idx_sine]
plt.axvline(x=optimal_degree_sine, color='green', linestyle='--', label=f'Optimal degree: {optimal_degree_sine}')

plt.title('Bias-Variance Tradeoff for Sine Wave Data')
plt.xlabel('Polynomial Degree')
plt.ylabel('Mean Squared Error')
plt.xticks(degrees_sine)
plt.legend()
plt.grid(True)
plt.show()

# Plot the best fit, reusing the model that was scored above so the curve
# shown is exactly the one whose validation error was the lowest
best_model_sine = models_sine[best_idx_sine]

# Create a smooth curve for plotting. X_sine is 2-D at this point, so the
# ndarray min/max methods are used to get scalar endpoints; the builtin
# min/max would iterate over rows and return 1-element arrays.
X_smooth_sine = np.linspace(X_sine.min() - 0.5, X_sine.max() + 0.5, 1000).reshape(-1, 1)
y_smooth_sine = best_model_sine.predict(X_smooth_sine)

plt.figure(figsize=(10, 6))
plt.scatter(X_train_sine, y_train_sine, color='blue', alpha=0.6, label='Training data')
plt.scatter(X_val_sine, y_val_sine, color='green', alpha=0.6, label='Validation data')
plt.plot(X_smooth_sine, y_smooth_sine, 'r-', linewidth=2, label=f'Degree {optimal_degree_sine} model')

plt.title(f'Best Polynomial Fit for Sine Wave (Degree {optimal_degree_sine})')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.grid(True)
plt.show()

## 8. Summary of Overfitting and Underfitting

Let's summarize what we've learned about the bias-variance tradeoff, underfitting, and overfitting.

### Key Observations

1. **Underfitting (High Bias)**
   - Occurs when the model is too simple to capture the underlying pattern in the data
   - Symptoms: high training error, high validation error
   - Example: Using a linear model (degree 1) to fit a cubic function
   - Adding more training data doesn't help much

2. **Overfitting (High Variance)**
   - Occurs when the model is too complex and captures noise in the training data
   - Symptoms: low training error, high validation error, large gap between training and validation error
   - Example: Using a very high-degree polynomial to fit a simpler function
   - Adding more training data can help

3. **Optimal Model Complexity**
   - Balances underfitting and overfitting
   - Symptoms: moderate training error, lowest validation error
   - Example: Using a degree-3 polynomial to fit a cubic function
   - Generalizes well to new data

4. **Training Set Size Effects**
   - Complex models require more training data to avoid overfitting
   - Simple models may underfit even with large training sets
   - Learning curves for optimal models show training and validation errors converging as training set size increases

5. **Different Function Types**
   - The optimal polynomial degree depends on the underlying function
   - Even for functions that aren't polynomials (like sine waves), polynomial regression can approximate them well with the right degree
   - More complex functions generally require higher-degree polynomials

### Practical Guidelines

1. **Start Simple**
   - Begin with a simple model and gradually increase complexity
   - Compare performance on validation data at each step

2. **Use Validation Sets**
   - Always evaluate models on data they haven't seen during training
   - Choose the model with the best validation performance, not training performance

3. **Plot Learning Curves**
   - Diagnose underfitting/overfitting by examining how errors change with training set size
   - If training and validation errors are high and close together → underfitting
   - If training error is low but validation error is high → overfitting

4. **Collect More Data**
   - If possible, collect more training data, especially when dealing with complex models
   - More data helps complex models generalize better

5. **Consider Regularization**
   - Instead of reducing model complexity, regularization can help prevent overfitting while maintaining flexibility
   - We'll explore this in the next notebook