# Polynomial Regression - Overfitting

What if your data is actually more complex than a simple straight line? Surprisingly,
you can actually use a linear model to fit nonlinear data. A simple way to do this is to
add powers of each feature as new features, then train a linear model on this extended
set of features. This technique is called Polynomial Regression.

In [None]:
# Imports used throughout the notebook.

import numpy as np
import numpy.random as rnd
import matplotlib.pyplot as plt
# IPython magic: render Matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed NumPy's global random generator so the synthetic data below is reproducible.
np.random.seed(42)
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

$y = 2 + X + 0.5 * X^2 + \text{noise}$

In [None]:
# Build the synthetic data set every model below is fitted to:
#   * X holds m points drawn uniformly from [-3, 3), shaped (m, 1);
#   * y follows the quadratic 0.5*X^2 + X + 2 plus standard-normal noise
#     scaled by `factor`.

m, factor = 20, 1.0
X = np.random.random_sample((m, 1)) * 6 - 3
y = 0.5 * X**2 + X + 2 + factor * np.random.standard_normal((m, 1))

$y = \beta_{0} + \beta_{1}*X$

In [None]:
# Expand X into polynomial features up to `degree`. With degree = 1 and
# include_bias=True the result is a constant column of ones followed by X
# itself, i.e. the inputs of a plain straight-line model.

degree = 1
poly_features = PolynomialFeatures(include_bias=True, degree=degree).fit(X)
X_poly = poly_features.transform(X)

In [None]:
# Hold out 20% of the samples as a test set and train on the remaining 80%.
# random_state is fixed so the split is reproducible from run to run.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.20, random_state=42)

In [None]:
# Fit a linear regression on the training portion only. The trailing
# expression echoes the fitted estimator as the cell output.

lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [None]:
# Regression evaluation metrics, reused by the later evaluation cells.
from sklearn import metrics

# Score the degree-1 model on the data it was trained on.
y_train_hat = lin_reg.predict(X_train)

# Mean absolute error, root-mean-squared error and R^2, in that order.
print('MAE:', metrics.mean_absolute_error(y_true=y_train, y_pred=y_train_hat))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_true=y_train, y_pred=y_train_hat)))
print('R-squared:', metrics.r2_score(y_true=y_train, y_pred=y_train_hat))

MAE: 1.1542273095606363
RMSE: 1.4934010143179992
R-squared: 0.49476236205076163


In [None]:
# Score the degree-1 model on the held-out test set, using the same three
# metrics as for the training set so the two can be compared directly.
y_test_hat = lin_reg.predict(X_test)

print('MAE:', metrics.mean_absolute_error(y_true=y_test, y_pred=y_test_hat))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_test_hat)))
print('R-squared:', metrics.r2_score(y_true=y_test, y_pred=y_test_hat))

MAE: 2.416535147659223
RMSE: 2.4587481285902184
R-squared: 0.4171962387639896


$y = \beta_{0} + \beta_{1}*X + \beta_{2}* X^2 + ... + \beta_{20}* X^{20}$

In [None]:
# Polynomial expansion up to degree 20: one bias column plus X^1 .. X^20,
# i.e. 21 columns for a data set of only 20 points.
degree = 20
poly_features = PolynomialFeatures(include_bias=True, degree=degree).fit(X)
X_poly = poly_features.transform(X)

In [None]:
# Split the degree-20 features 80/20 into train and test sets, with the same
# test_size and random_state as the other splits in this notebook.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.20, random_state=42)

In [None]:
# Fit a linear regression on the degree-20 training features. The trailing
# expression echoes the fitted estimator as the cell output.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [None]:
# Training-set metrics for the degree-20 model. It has more polynomial terms
# (21) than training samples (80% of 20 = 16), so it can reproduce the training
# targets almost exactly and the errors here come out close to zero.
y_train_hat = lin_reg.predict(X_train)

print('MAE:', metrics.mean_absolute_error(y_true=y_train, y_pred=y_train_hat))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_true=y_train, y_pred=y_train_hat)))
print('R-squared:', metrics.r2_score(y_true=y_train, y_pred=y_train_hat))

MAE: 1.6451780588816467e-06
RMSE: 2.921170152467713e-06
R-squared: 0.9999999999980669


In [None]:
# Test-set metrics for the degree-20 model. Comparing these with the
# training-set metrics shows how badly the model generalises (overfitting).
y_test_hat = lin_reg.predict(X_test)

print('MAE:', metrics.mean_absolute_error(y_true=y_test, y_pred=y_test_hat))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_test_hat)))
print('R-squared:', metrics.r2_score(y_true=y_test, y_pred=y_test_hat))

MAE: 1403593.5287315045
RMSE: 2807028.686442214
R-squared: -759605259314.3168


$y = \beta_{0} + \beta_{1}*X + \beta_{2}*X^2$

In [None]:
# Polynomial expansion up to degree 2 (bias, X, X^2), the same form as the
# quadratic that was used to generate y.
degree = 2
poly_features = PolynomialFeatures(include_bias=True, degree=degree).fit(X)
X_poly = poly_features.transform(X)

In [None]:
# Split the degree-2 features 80/20 into train and test sets, with the same
# test_size and random_state as the other splits in this notebook.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.20, random_state=42)

In [None]:
# Fit a linear regression on the degree-2 training features. The trailing
# expression echoes the fitted estimator as the cell output.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [None]:
# Training-set metrics (MAE, RMSE, R^2) for the degree-2 model.
y_train_hat = lin_reg.predict(X_train)

print('MAE:', metrics.mean_absolute_error(y_true=y_train, y_pred=y_train_hat))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_true=y_train, y_pred=y_train_hat)))
print('R-squared:', metrics.r2_score(y_true=y_train, y_pred=y_train_hat))

MAE: 0.6095683946400021
RMSE: 0.7734581075259688
R-squared: 0.8644759397446264


In [None]:
# Test-set metrics for the degree-2 model; compare them with the
# training-set metrics to judge how well the model generalises.
y_test_hat = lin_reg.predict(X_test)

print('MAE:', metrics.mean_absolute_error(y_true=y_test, y_pred=y_test_hat))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=y_test_hat)))
print('R-squared:', metrics.r2_score(y_true=y_test, y_pred=y_test_hat))

MAE: 1.0631945784178296
RMSE: 1.2178156394661737
R-squared: 0.8570259053089068
