# Polynomial Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
import os

# Look up the login name in the environment, then make that user's notebook
# directory on the scratch filesystem the working directory.
user = os.environ.get('USER')
notebook_dir = f'/scratch/cd82/{user}/notebooks'
os.chdir(notebook_dir)

In [None]:
# Synthetic quadratic dataset: y = 0.5*x^2 + x + 5 plus standard-normal noise
N = 100
np.random.seed(42)  # fixed seed so every run draws the same sample

# Inputs: uniform draws on [0, 1) stretched by 6 and shifted by -3, i.e. [-3, 3)
uniform_draws = np.random.rand(N, 1)
X = uniform_draws * 6 - 3

# Targets: the quadratic signal with the noise added on top
noise = np.random.randn(N, 1)
y = 0.5 * X**2 + X + 5 + noise

##### Plot the data

In [None]:
# Scatter plot of the raw samples on a small square figure
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(X, y, color='blue', label='X,y data')
ax.set_title('Polynomial Regression - Plot y vs X')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.legend()
plt.show()

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

#### Polynomial regression using ```sklearn```
Scikit-learn's ```Pipeline``` chains a pre-processing step (here, generating the polynomial features) with the regression fit, so both are applied together as a single estimator.

In [None]:
# Two-stage estimator: expand X into degree-2 polynomial features (no bias
# column, since LinearRegression fits its own intercept), then fit a linear model.
feature_step = ("poly_features", PolynomialFeatures(degree=2, include_bias=False))
regression_step = ("linear_regression", LinearRegression())
poly_model = Pipeline(steps=[feature_step, regression_step])

In [None]:
# Fit both pipeline stages on the training split
poly_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the targets for the held-out test inputs
y_pred = poly_model.predict(X=X_test)

In [None]:
# Score the held-out predictions: mean squared error and R²
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Visualise the fit: held-out points against the fitted curve

# Evaluate the model on an evenly spaced grid of 100 points from -3 to 3
X_plot = np.linspace(-3, 3, num=100)[:, np.newaxis]
y_plot = poly_model.predict(X_plot)

fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(X_test, y_test, color='blue', label='Actual data')
ax.plot(X_plot, y_plot, color='red', linewidth=2, label='Polynomial regression')
ax.set_title('Polynomial Regression')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.legend()
plt.show()

#### Save the pipeline
As the model looks good, we can save the pipeline so it can be run on new data.

In [None]:
import joblib

# Persist the fitted pipeline to disk, then read it straight back
model_file = 'poly_model.pkl'
joblib.dump(poly_model, model_file)
loaded_poly_pipeline = joblib.load(model_file)

# The reloaded pipeline applies the feature expansion and the regression in
# one call; swap X_test for your own dataset here.
new_data = X_test
predictions = loaded_poly_pipeline.predict(new_data)


#### Polynomial regression using ```statsmodels```

Reference:  
[Polynomial Regression using statsmodel](https://ostwalprasad.github.io/machine-learning/Polynomial-Regression-using-statsmodel.html) (ostwalprasad.github.io)

The ```statsmodels``` library does not have an automated method to add polynomial terms, so we write our own helper function to append them to the feature matrix.

In [None]:
import statsmodels.api as sm

# Create a function to add the squared term to the X data
def add_sqrd_column(X: np.ndarray, degree: int, index: int = 0) -> np.ndarray:
    """Append one column of ``X`` raised to ``degree`` as a new last column.

    Args:
        X: 2-D feature matrix; it is indexed as ``X[:, index]``.
        degree: Power applied element-wise to the selected column.
        index: Position of the column to raise to ``degree``.

    Returns:
        A new matrix holding the columns of ``X`` followed by the powered
        column. ``X`` itself is not modified.
    """
    # Raise the selected column to the requested power
    powered_col = X[:, index] ** degree
    # Stack the powered column (not a copy of the original one) onto X
    return np.column_stack((X, powered_col))

# Extend the train and test matrices with the power-2 version of column 0
X_train_p, X_test_p = (
    add_sqrd_column(split, 2, 0) for split in (X_train, X_test)
)
# statsmodels does not fit an intercept on its own, so add a constant column
X_train_p_int, X_test_p_int = (
    sm.add_constant(features) for features in (X_train_p, X_test_p)
)


In [None]:
# Alternative to the hand-written helper: scikit-learn's PolynomialFeatures
# builds the design matrix directly. With its default include_bias=True it
# also adds the intercept column of 1's, so sm.add_constant is not needed.
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm

# NOTE(review): degree=3 adds a cubic term, whereas the sklearn pipeline and
# the data-generating formula in this notebook are quadratic — confirm that
# the higher degree is intended.
polynomial_features = PolynomialFeatures(degree=3)
# Fit the transformer on the training data only, then reuse it for the test set
X_train_p_int = polynomial_features.fit_transform(X_train)
X_test_p_int = polynomial_features.transform(X_test)

# Ordinary least squares on the expanded training features
model_sm = sm.OLS(y_train, X_train_p_int).fit()

In [None]:
# Predict the held-out targets from the expanded test design matrix
y_test_pred = model_sm.predict(exog=X_test_p_int)

In [None]:
# Score the statsmodels predictions on the held-out split
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

for report_line in (f"Mean Squared Error: {mse}", f"R² Score: {r2}"):
    print(report_line)

In [None]:
# Plot the results
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Evaluate the fitted model on an evenly spaced grid of 100 points from -3 to 3
X_plot = np.linspace(-3, 3, 100).reshape(-1, 1)

# Reuse the already-fitted transformer; the grid must not refit it
X_plot_p_int = polynomial_features.transform(X_plot)

y_plot = model_sm.predict(X_plot_p_int)

# wls_prediction_std returns (std, lower bound, upper bound), in that order,
# evaluated at the training observations.
_, lower, upper = wls_prediction_std(model_sm)

plt.figure(figsize=(4, 4))
# Optional: overlay the prediction interval. The bounds belong to the training
# rows, so plot them against the sorted training X values:
# order = np.argsort(X_train[:, 0])
# plt.plot(X_train[order, 0], upper[order], '--', label="Upper")
# plt.plot(X_train[order, 0], lower[order], ':', label="Lower")
plt.scatter(X_test, y_test,
    color='blue', label='Actual data')
plt.plot(X_plot, y_plot, color='red',
    linewidth=2,
    label='Polynomial regression')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Polynomial Regression')
plt.legend()
plt.show()

In [None]:
# Show the statsmodels summary report for the fitted OLS model
model_sm.summary()