# Chapter 6. Advanced Regression
## Author: Rahul Bhadani

# 1. Kernel Functions

In [None]:
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Serif'
plt.rcParams['font.size'] = 15
from scipy.stats import norm

def tricube_kernel(x):
    """Tricube kernel rescaled so that its peak value is exactly 1.

    The normalised tricube density is ``(70/81) * (1 - |x|**3)**3`` on
    ``|x| < 1``. Dividing it by its maximum ``70/81`` leaves just
    ``(1 - |x|**3)**3``, so the constant is dropped instead of being
    multiplied in and divided out again (which only added rounding error).

    Parameters
    ----------
    x : array_like
        Scaled distances from the query point.

    Returns
    -------
    numpy.ndarray
        Kernel weights; 1 at ``x == 0`` and 0 wherever ``|x| >= 1``.
    """
    abs_x = np.abs(x)
    return np.where(abs_x < 1, (1 - abs_x**3) ** 3, 0)

def rectangular_kernel(x):
    """Uniform (boxcar) kernel: weight 1 strictly inside ``|x| < 1``, else 0."""
    inside_support = np.abs(x) < 1
    return np.where(inside_support, 1, 0)

def normal_kernel(x):
    """Standard normal density divided by its value at 0, so the peak is 1."""
    peak = norm.pdf(0)
    return norm.pdf(x) / peak

# Evaluation grid of distances x - x0
x = np.linspace(-4.5, 4.5, 400)

# Evaluate each kernel on the grid
y_tricube, y_normal, y_rectangular = (
    kernel(x) for kernel in (tricube_kernel, normal_kernel, rectangular_kernel)
)

# Draw the three kernels on one set of axes
plt.figure(figsize=(12, 6), dpi=600)
plt.plot(x, y_tricube, 'r', linestyle='-', alpha=0.5, label='Tricube')
plt.plot(x, y_normal, 'b--', label='Normal')
plt.plot(x, y_rectangular, 'k', linewidth=2, label='Rectangular')

# Mark the query point x0 and annotate how the weight decays away from it
plt.axvline(0, color='gray', linestyle=':', alpha=0.6)
annotation_style = dict(fontsize=18, color='black', alpha=0.7)
for x_pos, y_pos, message in (
    (0.05, 0.4, 'Higher weight near $x_0$'),
    (1.1, 0.1, 'Lower weight far from $x_0$'),
):
    plt.text(x_pos, y_pos, message, **annotation_style)

# Axis labels, title, legend and grid
plt.xlabel('$x - x_0$', fontsize=15)
plt.ylabel('Kernel weight', fontsize=15)
plt.title('Kernel Functions for Kernel Regression', fontsize=14)
plt.legend(loc='upper left')
plt.grid(True, linestyle=':', alpha=0.5)

# Write the figure to disk before showing it
plt.savefig('../figures/loess_kernel.pdf', transparent=True)
plt.show()

# 2. Synthetic Dataset Generation with Linear Model

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Seed NumPy's global RNG so the synthetic sample is reproducible
np.random.seed(420)

# Step 2: Draw the synthetic sample
n_samples = 1000  # rows
n_features = 6    # predictor columns

# Predictors: i.i.d. standard normal draws (drawn before the noise so the
# random stream is consumed in the same order as before)
X = np.random.randn(n_samples, n_features)

# True coefficients of the generating linear model
betas = np.array([1.5, -2.0, 0.8, 1.2, -0.5, 0.3])

# Additive standard normal noise
epsilon = np.random.normal(0, 1, n_samples)

# Response: linear combination of the predictors plus noise
Y = X.dot(betas) + epsilon

# Step 3: Collect predictors X1..X6 and the response in one DataFrame
columns = ['X' + str(k) for k in range(1, n_features + 1)]
data = pd.DataFrame(X, columns=columns).assign(Y=Y)

# Step 4: Pairwise correlations between all columns
correlation_matrix = data.corr()

# Step 5: Render the correlation matrix as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', cbar=True)
#plt.title('Correlation Matrix Heatmap')
plt.savefig('../figures/heatmap_interaction.pdf', transparent=True)
plt.show()

# 3. Residual Plotting


In [None]:
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams.update({'font.family': 'Serif', 'font.size': 22})

# Predictor grid
x = np.linspace(500, 1000, 100)

# Noise-free straight line
y = 4.0 * x + 7.0

# Perturb the line with Gaussian noise
mu = 0      # noise mean
sigma = 20  # noise standard deviation
noise = np.random.normal(loc=mu, scale=sigma, size=y.shape)
y_noisy = y + noise

# Scatter plot of the noisy observations on a transparent figure background
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y_noisy)
fig.patch.set_alpha(0.0)
ax.set_xlabel('House Size (sq ft)')
ax.set_ylabel('House Price (100K USD)')
plt.show()



In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
plt.rcParams['font.size'] = 25

# Fit ordinary least squares to the noisy line; scikit-learn expects a 2-D
# design matrix, hence the reshape to a single column.
linearModel = LinearRegression()
x_column = x.reshape(-1, 1)
linearModel.fit(x_column, y_noisy)
yhat = linearModel.predict(x_column)

# Residual = observed - fitted. This was previously computed with the opposite
# sign (fitted - observed), which mirrored the plot about zero.
error = y_noisy - yhat

# Residuals against the predictor, on a transparent figure background
fig, ax = plt.subplots(figsize=(12, 8))
plt.scatter(x, error)
fig.patch.set_alpha(0.0)
plt.xlabel('Predictor $x$', fontsize=25)
plt.ylabel('Residual $e$', fontsize=25)
plt.grid(which='both')
plt.savefig('../figures/normal_residual_plot.pdf', transparent=True)
plt.show()


In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Build the design matrix [1, x] in its own variable so that the predictor `x`
# is not overwritten by a two-column matrix.
x_design = sm.add_constant(x)
model = sm.OLS(y_noisy, x_design).fit()

# Residuals of the OLS fit
residuals = model.resid

# Q-Q plot. qqplot opens a new figure unless it is handed an axis, so a prior
# plt.figure(figsize=...) call only produced an empty, unused figure. Create
# the axis here and pass it in so the requested size is actually applied.
fig, ax = plt.subplots(figsize=(4, 8))
sm.qqplot(residuals, line='s', ax=ax)  # 's' draws the standardized reference line
ax.set_title('Q-Q Plot of Residuals')
fig.savefig('../figures/qq_plot.pdf', transparent=True)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
plt.rcParams['font.size'] = 25

# Seed NumPy's global RNG so both simulated samples are reproducible
np.random.seed(42)

# --- Constant noise scale: standard deviation 1 for every observation ---
X_homoscedastic = np.random.uniform(0, 10, 100).reshape(-1, 1)
y_homoscedastic = 2 * X_homoscedastic.squeeze() + np.random.normal(0, 1, 100)

model_homoscedastic = LinearRegression().fit(X_homoscedastic, y_homoscedastic)
y_pred_homoscedastic = model_homoscedastic.predict(X_homoscedastic)
residuals_homoscedastic = y_homoscedastic - y_pred_homoscedastic

# --- Noise scale equal to the predictor value, so the spread grows with X ---
X_heteroscedastic = np.random.uniform(0, 10, 100).reshape(-1, 1)
x_flat = X_heteroscedastic.squeeze()
y_heteroscedastic = 2 * x_flat + np.random.normal(0, x_flat)

model_heteroscedastic = LinearRegression().fit(X_heteroscedastic, y_heteroscedastic)
y_pred_heteroscedastic = model_heteroscedastic.predict(X_heteroscedastic)
residuals_heteroscedastic = y_heteroscedastic - y_pred_heteroscedastic

# Side-by-side residual plots, one panel per scenario
panels = (
    ('Homoscedasticity', X_homoscedastic, residuals_homoscedastic, 'blue'),
    ('Heteroscedasticity', X_heteroscedastic, residuals_heteroscedastic, 'green'),
)
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for axis, (panel_title, predictor, resid, point_color) in zip(axes, panels):
    axis.scatter(predictor, resid, color=point_color, alpha=0.6)
    axis.axhline(0, color='red', linestyle='--', linewidth=1)  # zero-residual reference
    axis.set_title(panel_title)
    axis.set_xlabel('X')
    axis.set_ylabel('Residuals')

plt.tight_layout()
plt.savefig('../figures/Homoscedasticity_vs_Heteroscedasticity.pdf', transparent=True)
plt.show()

# 2. Multiple Linear Regression using Scikit-Learn


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Serif'
plt.rcParams['font.size'] = 15

## Reading the Data From Github

In [None]:
# Raw CSV of the concrete compressive strength data in the course repository
concrete_csv_url = 'https://raw.githubusercontent.com/rahulbhadani/CPE490_590_Sp2025/refs/heads/master/Data/Concrete_Compressive_Strength/Concrete_Data.csv'
df = pd.read_csv(concrete_csv_url)
df.columns

### Rename columns to more accessible format

In [None]:
# Map the verbose CSV headers to short identifier-style names.
# NOTE: the keys must match the CSV headers character for character; several
# of them contain a double space or a trailing space, so do not "tidy" them.
new_column_names = {
    'Cement (component 1)(kg in a m^3 mixture)': 'Cement_Amount',
    'Blast Furnace Slag (component 2)(kg in a m^3 mixture)': 'Blast_Furnace_Slag_Amount',
    'Fly Ash (component 3)(kg in a m^3 mixture)': 'Fly_Ash_Amount',
    'Water  (component 4)(kg in a m^3 mixture)': 'Water_Amount',
    'Superplasticizer (component 5)(kg in a m^3 mixture)': 'Superplasticizer_Amount',
    'Coarse Aggregate  (component 6)(kg in a m^3 mixture)': 'Coarse_Aggregate_Amount',
    'Fine Aggregate (component 7)(kg in a m^3 mixture)': 'Fine_Aggregate_Amount',
    'Age (day)': 'Age',
    'Concrete compressive strength(MPa, megapascals) ': 'Concrete_Strength'
}

# Rename in place; with the default settings a key that matches no column is
# ignored silently, so a mismatch only surfaces later as a KeyError.
df.rename(columns=new_column_names, inplace=True)

# Print the resulting column names to confirm the rename took effect
print(df.columns)

In [None]:
# Preview the first rows of the renamed DataFrame
df.head()

## We are only going to use three features


In [None]:
# Keep three of the mixture components as predictors
feature_columns = ['Cement_Amount', 'Blast_Furnace_Slag_Amount','Fly_Ash_Amount']
df_filtered = df[feature_columns]

# Features as an (n, 3) float64 array, response as an (n, 1) float64 column
x = df_filtered.values.astype(np.float64)
y = df[['Concrete_Strength']].values.reshape(-1, 1).astype(np.float64)

# Visualize features vs response

In [None]:
import plotly.graph_objects as go
import numpy as np

# Rebuild the arrays so that this cell can be run on its own once
# df_filtered and y exist.
x = df_filtered.values.astype(np.float64)  # (n, 3) feature matrix
y = y.astype(np.float64)                   # (n, 1) response column

# 3-D scatter: two of the mixture components against compressive strength
fig = go.Figure(data=[go.Scatter3d(
    x=x[:, 0],  # Cement_Amount
    y=x[:, 1],  # Blast_Furnace_Slag_Amount
    z=y[:, 0],  # Concrete_Strength
    mode='markers',
    marker=dict(
        size=5,
        color=y[:, 0],  # Color by concrete strength
        colorscale='Viridis',
        opacity=0.8
    )
)])

# Axis titles and figure title. The title previously described a different
# dataset (current, voltage, charge capacity); it now names the plotted columns.
fig.update_layout(
    scene=dict(
        xaxis_title='Cement (component 1)(kg in a m^3 mixture)',
        yaxis_title='Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
        zaxis_title='Concrete compressive strength(MPa, megapascals) '
    ),
    title='3D Plot of Cement and Blast Furnace Slag vs Concrete Compressive Strength'
)

# Show plot
fig.show()

In [None]:
# Cement amount (feature column 0) against compressive strength
current_axes = plt.gca()
current_axes.scatter(x[:, 0], y[:, 0])
current_axes.set_xlabel('Cement_Amount')
current_axes.set_ylabel('Concrete Strength')

In [None]:
# Blast furnace slag amount (feature column 1) against compressive strength
current_axes = plt.gca()
current_axes.scatter(x[:, 1], y[:, 0])
current_axes.set_xlabel('Blast_Furnace_Slag_Amount')
current_axes.set_ylabel('Concrete Strength')

In [None]:
# Fly ash amount (feature column 2) against compressive strength
current_axes = plt.gca()
current_axes.scatter(x[:, 2], y[:, 0])
current_axes.set_xlabel('Fly_Ash_Amount')
current_axes.set_ylabel('Concrete Strength')

## Split the Dataset into Training and Testing

In [None]:
# Hold out one third of the rows for testing; random_state pins the shuffle
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    x,
    y,
    test_size=1/3,
    random_state=0,
)


# Fitting Multiple Linear Regression to the training set

In [None]:
# Ordinary least squares on the three training features
regressor = LinearRegression()
regressor.fit(X=X_Train, y=Y_Train)

# Coefficients

In [None]:
# Display the fitted slope coefficients and the intercept
regressor.coef_, regressor.intercept_

# Mean Squared Error on Training Data

In [None]:
# Import the metric by name: aliasing sklearn.metrics as `sm` would shadow the
# statsmodels alias `sm` that other cells of this notebook rely on.
from sklearn.metrics import mean_squared_error

# Predictions on the data the model was fitted to
Y_Pred = regressor.predict(X_Train)

# Training mean squared error
e = mean_squared_error(Y_Train, Y_Pred)
print(f"MSE = {e}")

# Mean Squared Error on Test Data

In [None]:
# Import the metric by name: aliasing sklearn.metrics as `sm` would shadow the
# statsmodels alias `sm` that other cells of this notebook rely on.
from sklearn.metrics import mean_squared_error

# Predictions on the held-out data
Y_Pred = regressor.predict(X_Test)

# Test mean squared error
e = mean_squared_error(Y_Test, Y_Pred)
print(f"MSE = {e}")

# L2 Regularization

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# L2-penalised linear regression. This section is about L2 regularization and
# the variable is named for Ridge, but Lasso (an L1 penalty) was instantiated.
ridgeModel = Ridge(alpha = 5)
ridgeModel.fit(X_Train, Y_Train)

# score() reports the coefficient of determination R^2 on the held-out data
ridgeModel.score(X_Test, Y_Test)

# MSE on Training Set

In [None]:
# Import the metric by name: aliasing sklearn.metrics as `sm` would shadow the
# statsmodels alias `sm` that other cells of this notebook rely on.
from sklearn.metrics import mean_squared_error

# Predictions of the regularized model on the training data
Y_Pred = ridgeModel.predict(X_Train)

# Training mean squared error
e = mean_squared_error(Y_Train, Y_Pred)
print(f"MSE = {e}")

# MSE on Test Set

In [None]:
# Import the metric by name: aliasing sklearn.metrics as `sm` would shadow the
# statsmodels alias `sm` that other cells of this notebook rely on.
from sklearn.metrics import mean_squared_error

# Predictions of the regularized model on the held-out data
Y_Pred = ridgeModel.predict(X_Test)

# Test mean squared error
e = mean_squared_error(Y_Test, Y_Pred)
print(f"MSE = {e}")

# 3. Multiple Regression using StatsModel

In [None]:
import statsmodels.api as sm 
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Serif'
plt.rcParams['font.size'] = 15

## Read the data

In [None]:
# Raw CSV of the Advertising data; its first column is used as the row index
advertising_csv_url = 'https://raw.githubusercontent.com/rahulbhadani/CPE490_590_Sp2025/refs/heads/master/Data/Advertising/Advertising.csv'
df = pd.read_csv(advertising_csv_url, index_col=0)
df.columns

The dataset contains TV Budget, Radio Budget and Newspaper Budget for an advertisement of a product at a company and Sales.

Our goal is to predict sales based on TV Budget, Radio Budget and Newspaper Budget

## Plot the Data


In [None]:
import matplotlib.pyplot as plt

# One row of three panels, one per advertising channel
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Shared marker styling
colors = ['blue', 'green', 'red']  # defined for each subplot but not used below
marker_size = 50
marker_edge_color = 'black'
marker_face_colors = ['lightblue', 'lightgreen', 'pink']

# Each channel's budget against Sales, drawn with identical formatting
channels = ['TV', 'Radio', 'Newspaper']
for axis, channel, face_color in zip(axes, channels, marker_face_colors):
    axis.scatter(df[channel], df['Sales'], s=marker_size, c=face_color,
                 edgecolor=marker_edge_color, label=f'{channel} vs Sales')
    axis.set_title(f'{channel} vs Sales', fontsize=14)
    axis.set_xlabel(f'{channel} Advertising Budget', fontsize=12)
    axis.set_ylabel('Sales', fontsize=12)
    axis.grid(True, linestyle='--', alpha=0.6)

# Keep labels of neighbouring panels from overlapping
plt.tight_layout()

plt.show()

## Split the Dataset into Training and Testing

In [None]:
# Predictors: the three advertising budgets; response: Sales
df_filtered = df[['TV', 'Radio', 'Newspaper']]
x = df_filtered.values.astype(np.float64)
y = df[['Sales']].values.reshape(-1, 1).astype(np.float64)

# Hold out one third of the rows for testing; random_state pins the shuffle
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    x,
    y,
    test_size=1/3,
    random_state=0,
)


## Fitting Multiple Linear Regression to the training set

In [None]:
# Prepend a column of ones so that the model includes an intercept
xt = sm.add_constant(X_Train)

# Specify the OLS model, then estimate it
ols_model = sm.OLS(Y_Train, xt)
est = ols_model.fit()

## Print the Summary of the Result

In [None]:
# Display the regression summary table of the fitted OLS model
est.summary()

## Summary
In the above result, we see that the coefficient of determination $R^2$ was $0.907$, and the estimated coefficients were $w_0 = 2.9038$, $w_1 = 0.0443$, $w_2 = 0.1966$, and $w_3 = 0.0026$.
Their respective 95% confidence intervals are [2.175, 3.632], [0.041, 0.048], [0.178, 0.216], and [-0.011, 0.017].

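Note: these numbers depend on the particular train/test split. Here the split is fixed by `random_state = 0`; if the random state is changed or removed, the split will be drawn differently each time the notebook is run, and the estimates will change with it.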

If any feature has a high p-value (> 0.05), it might not be contributing significantly to the prediction of Sales. Looking at the P > |t| column, we see that the p-value for x3 is 0.712, which means that Newspaper is not contributing significantly to Sales.

## Mean Squared Error on Training Dataset

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
# In-sample predictions of the OLS fit
y_pred = est.predict(xt)

# Flatten the (n, 1) response so it lines up with the 1-D predictions
mse = mean_squared_error(Y_Train.ravel(), y_pred)
print(mse)

## Residual Analysis

In [None]:
# Training residuals: observed minus fitted (response flattened to 1-D first)
residuals = Y_Train.ravel() - y_pred

# Residuals against fitted values, with a dashed reference line at zero
residual_axes = plt.gca()
residual_axes.scatter(y_pred, residuals, color='blue', edgecolor='black')
residual_axes.axhline(y=0, color='red', linestyle='--')
residual_axes.set_title('Residuals vs Fitted Values', fontsize=14)
residual_axes.set_xlabel('Fitted Values', fontsize=12)
residual_axes.set_ylabel('Residuals', fontsize=12)
residual_axes.grid(True, linestyle='--', alpha=0.6)
plt.show()

The residuals show no funnel-shaped pattern, so the constant-variance assumption of linear regression does not appear to be violated.

## QQ Plot

In [None]:
import scipy.stats as stats

# Normal probability (Q-Q) plot of the flattened residuals, drawn via pyplot
flat_residuals = residuals.ravel()
stats.probplot(flat_residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot of Residuals', fontsize=14)
plt.show()

There does not seem to be a strong deviation from the assumption that the residuals are normally distributed.

## Prediction on the test data

In [None]:
# The test design matrix needs the same intercept column as the training one
xtest = sm.add_constant(X_Test)

# Out-of-sample predictions
y_pred = est.predict(exog=xtest)

## Mean Squared Error on test Dataset

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error of the OLS predictions on the held-out data
mse = mean_squared_error(y_true=Y_Test, y_pred=y_pred)
print(mse)

## Feature Importance
Feature importance refers to techniques that calculate a score for all input features in a machine learning model. These scores represent how useful or valuable each feature is in predicting the target variable. Higher value means more important feature.

In [None]:
# Tabulate the coefficients of the statsmodels OLS fit `est` (not a Lasso fit),
# labelled in design-matrix order: the constant first, then TV, Radio, Newspaper.
# NOTE(review): raw coefficients are only comparable as "importance" when the
# features share a common scale — confirm whether standardization is intended.
feature_importance = pd.DataFrame({
    'Feature': ['Constant', 'TV', 'Radio', 'Newspaper'],
    'Coefficient': est.params
})
print(feature_importance)

## Cross-validation MSE
In k-fold cross-validation, the dataset is divided into k subsets (folds). The model is trained on k−1 folds and tested on the remaining fold.
This process is repeated k times, with each fold serving as the test set once. The MSE is then averaged across all k iterations.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# 5-fold cross-validation of plain linear regression on the full data set.
# The scorer returns negated MSE values, so flip the sign back before averaging.
lr = LinearRegression()
scores = cross_val_score(estimator=lr, X=x, y=y, cv=5, scoring='neg_mean_squared_error')
mse_scores = np.negative(scores)
print(f"Cross-validated MSE: {mse_scores.mean()}")

## L2 Regularization

In [None]:
# Ridge (L2) fit through statsmodels' elastic net: alpha is the penalty
# strength, and L1_wt=0.0 gives the L1 term no share of the penalty.
ridge_model = sm.OLS(Y_Train, xt)
ridge_results = ridge_model.fit_regularized(
    method='elastic_net',
    alpha=10.0,
    L1_wt=0.0,
)

# Predictions on the training data and on the held-out data
y_train_pred_ridge = ridge_results.predict(xt)
y_test_pred_ridge = ridge_results.predict(sm.add_constant(X_Test))

# Mean squared error of the ridge fit on each split
mse_train_ridge = mean_squared_error(Y_Train, y_train_pred_ridge)
mse_test_ridge = mean_squared_error(Y_Test, y_test_pred_ridge)
for split_name, split_mse in (('Training', mse_train_ridge), ('Testing', mse_test_ridge)):
    print(f"Ridge {split_name} MSE: {split_mse}")

## L1 Regularization

In [None]:
# Apply Lasso (L1) regularization.
# In statsmodels' elastic net, `alpha` is the overall penalty strength and
# `L1_wt` (which must lie in [0, 1]) is the share of the penalty given to the
# L1 term. The two values were swapped: alpha=0.0 switched the penalty off
# altogether and L1_wt=40.0 is outside the valid range. alpha is a tuning
# parameter; adjust it to taste.
lasso_model = sm.OLS(Y_Train, xt)
lasso_results = lasso_model.fit_regularized(method='elastic_net', alpha=40.0, L1_wt=1.0)  # L1_wt=1 for Lasso

# Predict on training and test data
y_train_pred_lasso = lasso_results.predict(xt)
y_test_pred_lasso = lasso_results.predict(sm.add_constant(X_Test))

# Calculate MSE for Lasso
mse_train_lasso = mean_squared_error(Y_Train, y_train_pred_lasso)
mse_test_lasso = mean_squared_error(Y_Test, y_test_pred_lasso)
print(f"Lasso Training MSE: {mse_train_lasso}")
print(f"Lasso Testing MSE: {mse_test_lasso}")

# 4. Implementation of Multiple Linear Regression Using PyTorch

## Method 1

## Load Dataset

In [None]:
# Load the dataset
import torch
advertising_csv_url = 'https://raw.githubusercontent.com/rahulbhadani/CPE490_590_Sp2025/refs/heads/master/Data/Advertising/Advertising.csv'
df = pd.read_csv(advertising_csv_url, index_col=0)

# Predictors and response as float32 arrays, matching the dtype of the
# weight and bias tensors
X = df[['TV', 'Radio', 'Newspaper']].values.astype(np.float32)
y = df['Sales'].values.astype(np.float32)

# Hold out one third of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=0,
)

# Tensors for PyTorch; targets become (n, 1) columns to match the model output
X_train_tensor = torch.tensor(X_train)
y_train_tensor = torch.tensor(y_train).unsqueeze(1)
X_test_tensor = torch.tensor(X_test)
y_test_tensor = torch.tensor(y_test).unsqueeze(1)

## Define the model

In [None]:
# Trainable parameters, randomly initialised: one weight per feature plus a bias
input_dim = X_train.shape[1]
W = torch.randn((input_dim, 1), dtype=torch.float32, requires_grad=True)
b = torch.randn((1,), dtype=torch.float32, requires_grad=True)


def model(X):
    """Return the linear prediction ``X @ W + b`` using the global ``W`` and ``b``."""
    return torch.matmul(X, W) + b

## Define the Loss Function

In [None]:
# Define Mean Squared Error loss
def mse_loss(Y_pred, Y_true):
    """Return the mean of the element-wise squared differences."""
    squared_error = (Y_pred - Y_true).pow(2)
    return squared_error.mean()

## Training Procedure

In [None]:
# Hyperparameters
learning_rate = 0.00001
num_epochs = 1000000

# Full-batch gradient descent with hand-written parameter updates
for epoch in range(num_epochs):
    # Forward pass and loss on the whole training set
    Y_pred = model(X_train_tensor)
    loss = mse_loss(Y_pred, y_train_tensor)

    # Backward pass: fills W.grad and b.grad
    loss.backward()

    # Gradient step; no_grad keeps the in-place update out of the autograd graph
    with torch.no_grad():
        W.sub_(learning_rate * W.grad)
        b.sub_(learning_rate * b.grad)

    # Gradients accumulate across backward() calls, so reset them each epoch
    W.grad.zero_()
    b.grad.zero_()

    # Report the loss every 1000 epochs
    if (epoch + 1) % 1000 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:
# Test-set MSE; gradients are not needed for evaluation
with torch.no_grad():
    Y_test_pred = model(X_test_tensor)
    test_loss = mse_loss(Y_test_pred, y_test_tensor)
print(f'Test MSE: {test_loss.item():.4f}')

In [None]:
# Display the learned weight column and bias
W, b

## Method 2

## Define the Model

In [135]:
import torch.nn as nn
class LinearRegressionModel(nn.Module):
    """Linear model ``x @ weights + bias`` with randomly initialised parameters."""

    def __init__(self, input_dim):
        """Create an ``(input_dim, 1)`` weight column and a one-element bias."""
        super().__init__()
        # Both are registered as nn.Parameter so an optimizer can update them
        self.weights = nn.Parameter(torch.randn(input_dim, 1))
        self.bias = nn.Parameter(torch.randn(1))

    def forward(self, x):
        """Return the prediction ``x @ weights + bias``."""
        return torch.matmul(x, self.weights) + self.bias

## Instantiate The Model

In [136]:
# Build the model with one weight per feature column of the training data
input_dim = X_train.shape[1]
model = LinearRegressionModel(input_dim=input_dim)

## Loss and Optimizer

In [137]:
# Mean squared error as the training objective
criterion = nn.MSELoss()

# Plain stochastic gradient descent over all model parameters
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-5)

## Training 

In [138]:
# Full-batch training driven by the optimizer
num_epochs = 100000
for epoch in range(num_epochs):
    # Clear the gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass and loss on the whole training set
    y_pred = model(X_train_tensor)
    loss = criterion(y_pred, y_train_tensor)

    # Backward pass, then one parameter update
    loss.backward()
    optimizer.step()

    # Report the loss every 10000 epochs
    if (epoch + 1) % 10000 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10000/100000], Loss: 2.9943
Epoch [20000/100000], Loss: 2.9688
Epoch [30000/100000], Loss: 2.9448
Epoch [40000/100000], Loss: 2.9223
Epoch [50000/100000], Loss: 2.9009
Epoch [60000/100000], Loss: 2.8808
Epoch [70000/100000], Loss: 2.8618
Epoch [80000/100000], Loss: 2.8439
Epoch [90000/100000], Loss: 2.8270
Epoch [100000/100000], Loss: 2.8110


## Evaluation

In [139]:
# Test-set MSE; gradients are not needed for evaluation
with torch.no_grad():
    y_test_pred = model(X_test_tensor)
    test_loss = criterion(y_test_pred, y_test_tensor)
print(f'Test MSE: {test_loss.item():.4f}')

Test MSE: 3.6898
