In [2]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [3]:
# Load the diabetes dataset as a (features, target) pair of arrays
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

In [3]:
# Sanity check: fail fast if any feature value is missing, rather than
# displaying the full element-wise boolean mask (which is truncated on screen
# and cannot be read as a yes/no answer).
assert not np.isnan(diabetes_X).any(), "diabetes_X contains NaN values"

In [6]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# partition identical between runs.
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
    diabetes_X,
    diabetes_y,
    test_size=0.2,
    random_state=5,
)

## Linear regression without regularization

In [20]:
# Baseline model: plain linear regression (no regularization penalty)
regr = linear_model.LinearRegression()

In [21]:
# Fit on the training split only; the test split is kept aside for evaluation
regr.fit(diabetes_X_train, diabetes_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [22]:
# Predict on both splits so that train and test error can be compared
diabetes_y_train_pred = regr.predict(diabetes_X_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

In [23]:
# Fitted parameters: the feature weights followed by the intercept
print("Coefficients: \n", regr.coef_)
print("Intrecept: \n", regr.intercept_)

# Mean squared error on each split
train_mse = mean_squared_error(diabetes_y_train, diabetes_y_train_pred)
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
print("Mean squared error for train set: %.2f" % train_mse)
print("Mean squared error for test set: %.2f" % test_mse)

# Coefficient of determination (R^2) on the test split: 1 is perfect prediction
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)
print("Coefficient of determination: %.2f" % test_r2)

Coefficients: 
 [   2.72195846 -255.94592688  522.83461574  353.10273364 -827.58494078
  543.32591808  115.93459912  214.68877404  694.94194778   32.73088487]
Intrecept: 
 152.22183644503602
Mean squared error for train set: 2845.00
Mean squared error for test set: 2981.59
Coefficient of determination: 0.53


## L2 regularization

In [15]:
# Ridge (L2-penalised) regression, created and fitted on the training split
regr = linear_model.Ridge(alpha=0.1).fit(diabetes_X_train, diabetes_y_train)

# Predictions for both splits
diabetes_y_train_pred = regr.predict(diabetes_X_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

# Fitted parameters: the feature weights followed by the intercept
print("Coefficients: \n", regr.coef_)
print("Intrecept: \n", regr.intercept_)

# Mean squared error on each split
train_mse = mean_squared_error(diabetes_y_train, diabetes_y_train_pred)
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
print("Mean squared error for train set: %.2f" % train_mse)
print("Mean squared error for test set: %.2f" % test_mse)

# Coefficient of determination (R^2) on the test split: 1 is perfect prediction
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)
print("Coefficient of determination: %.2f" % test_r2)

Coefficients: 
 [  13.35316225 -209.85870329  490.24942547  317.81042472  -68.89882806
  -36.47782293 -192.5888623   130.54227845  371.68943292   61.94914469]
Intrecept: 
 152.1799012738903
Mean squared error for train set: 2884.02
Mean squared error for test set: 3041.11
Coefficient of determination: 0.52


## L1 regularization

In [16]:
# Lasso (L1-penalised) regression, created and fitted on the training split
regr = linear_model.Lasso(alpha=0.1).fit(diabetes_X_train, diabetes_y_train)

# Predictions for both splits
diabetes_y_train_pred = regr.predict(diabetes_X_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

# Fitted parameters: the feature weights followed by the intercept
print("Coefficients: \n", regr.coef_)
print("Intrecept: \n", regr.intercept_)

# Mean squared error on each split
train_mse = mean_squared_error(diabetes_y_train, diabetes_y_train_pred)
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
print("Mean squared error for train set: %.2f" % train_mse)
print("Mean squared error for test set: %.2f" % test_mse)

# Coefficient of determination (R^2) on the test split: 1 is perfect prediction
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)
print("Coefficient of determination: %.2f" % test_r2)

Coefficients: 
 [   0.         -156.66189193  533.0647414   298.16455717   -0.
   -0.         -240.67195952    0.          399.6813174     4.03932171]
Intrecept: 
 152.2649533056627
Mean squared error for train set: 2903.50
Mean squared error for test set: 3096.86
Coefficient of determination: 0.51


## ElasticNet regularization

In [14]:
# ElasticNet regression, created and fitted on the training split
regr = linear_model.ElasticNet(alpha=0.1).fit(diabetes_X_train, diabetes_y_train)

# Predictions for both splits
diabetes_y_train_pred = regr.predict(diabetes_X_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

# Fitted parameters: the feature weights followed by the intercept
print("Coefficients: \n", regr.coef_)
print("Intrecept: \n", regr.intercept_)

# Mean squared error on each split
train_mse = mean_squared_error(diabetes_y_train, diabetes_y_train_pred)
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
print("Mean squared error for train set: %.2f" % train_mse)
print("Mean squared error for test set: %.2f" % test_mse)

# Coefficient of determination (R^2) on the test split: 1 is perfect prediction
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)
print("Coefficient of determination: %.2f" % test_r2)

Coefficients: 
 [ 11.02271214   0.05874494  38.52039249  28.87699834   9.96555622
   7.97201921 -23.4721427   23.81895143  32.68848971  20.88095878]
Intrecept: 
 150.94237683837974
Mean squared error for train set: 5243.52
Mean squared error for test set: 5696.73
Coefficient of determination: 0.10


## ===================================================================================================================
## Feature Selection

In [34]:
# Keep five of the ten features: columns 2, 3, 4, 5 and 8 (bmi, bp, s1, s2 and
# s5 in scikit-learn's documented column order for this dataset).
# The columns are taken from a freshly loaded matrix so the cell is idempotent:
# slicing `diabetes_X` in place raises an IndexError on a second run, because
# columns 5 and 8 no longer exist once only five columns remain.
diabetes_X_full, _ = datasets.load_diabetes(return_X_y=True)
diabetes_X = diabetes_X_full[:, [2, 3, 4, 5, 8]]

In [35]:
# Check the shape of the feature matrix after column selection
diabetes_X.shape

(442, 5)

In [36]:
# Re-split using the reduced feature matrix; same test fraction and seed as
# the first split in this notebook.
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
    diabetes_X,
    diabetes_y,
    test_size=0.2,
    random_state=5,
)

In [37]:
# Create a fresh linear regression before fitting. On a top-to-bottom run,
# `regr` is still bound to the ElasticNet model from the regularization
# section, so fitting it here would silently train ElasticNet rather than the
# LinearRegression that this section's recorded output reports.
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [38]:
# Predict on both splits so that train and test error can be compared
diabetes_y_train_pred = regr.predict(diabetes_X_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

In [39]:
# Fitted parameters: the feature weights followed by the intercept
print("Coefficients: \n", regr.coef_)
print("Intrecept: \n", regr.intercept_)

# Mean squared error on each split
train_mse = mean_squared_error(diabetes_y_train, diabetes_y_train_pred)
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
print("Mean squared error for train set: %.2f" % train_mse)
print("Mean squared error for test set: %.2f" % test_mse)

# Coefficient of determination (R^2) on the test split: 1 is perfect prediction
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)
print("Coefficient of determination: %.2f" % test_r2)

Coefficients: 
 [ 572.59824734  286.38753621 -560.5860924   406.79874277  664.99149741]
Intrecept: 
 152.5058971540801
Mean squared error for train set: 2960.11
Mean squared error for test set: 3072.13
Coefficient of determination: 0.51


===================================================================================================================

In [13]:
# Non-linear features: degree-4 polynomial expansion of all ten original
# features (no bias column).
# The full matrix is reloaded here because the feature-selection section
# rebinds `diabetes_X` to five columns; on a top-to-bottom run, expanding that
# would give 125 columns instead of the 1000 recorded for this section.
diabetes_X_full, _ = datasets.load_diabetes(return_X_y=True)
diabetes_X_ = PolynomialFeatures(degree=4, include_bias=False).fit_transform(diabetes_X_full)

In [14]:
# Check how many columns the polynomial expansion produced
diabetes_X_.shape

(442, 1000)

In [15]:
# Re-split using the polynomial feature matrix; same test fraction and seed as
# the earlier splits in this notebook.
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
    diabetes_X_,
    diabetes_y,
    test_size=0.2,
    random_state=5,
)

In [16]:
# Create a fresh linear regression before fitting, so this cell does not
# depend on whichever estimator `regr` was last bound to by an earlier cell.
# The recorded output for this section is that of a LinearRegression.
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
# Predict on both splits so that train and test error can be compared
diabetes_y_train_pred = regr.predict(diabetes_X_train)
diabetes_y_pred = regr.predict(diabetes_X_test)

In [18]:
# The coefficient vector is not printed in this section (presumably because
# the polynomial expansion makes it too long to read); only the intercept is.
print("Intrecept: \n", regr.intercept_)

# Mean squared error on each split
train_mse = mean_squared_error(diabetes_y_train, diabetes_y_train_pred)
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
print("Mean squared error for train set: %.2f" % train_mse)
print("Mean squared error for test set: %.2f" % test_mse)

# Coefficient of determination (R^2) on the test split: 1 is perfect prediction
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)
print("Coefficient of determination: %.2f" % test_r2)

Intrecept: 
 23.941460715169427
Mean squared error for train set: 0.00
Mean squared error for test set: 330631.27
Coefficient of determination: -51.43
