In [6]:

%matplotlib inline
from bokeh.plotting import figure, show, output_notebook


# Linear Regression Example

This example uses only the first feature of the `diabetes` dataset, in
order to illustrate a two-dimensional plot of this regression technique. The
straight line can be seen in the plot, showing how linear regression attempts
to draw a straight line that will best minimize the residual sum of squares
between the observed responses in the dataset, and the responses predicted by
the linear approximation.

The coefficients, the residual sum of squares and the variance score are also
calculated.



In [7]:
# NOTE: inside a notebook kernel `__doc__` is IPython's auto-generated module
# docstring (see this cell's output), not a description of this example.
print(__doc__)


# Code source: Jaques Grobler
# License: BSD 3 clause


import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
# Load the diabetes dataset
diabetes = datasets.load_diabetes()


# Use the full feature matrix: no column is sliced out here, so every
# feature column of `diabetes.data` is passed on to the models below.
diabetes_X = diabetes.data


Automatically created module for IPython interactive environment


In [8]:
len(diabetes_X)

442

In [14]:
# Split the data into training/testing sets.
# A seeded permutation shuffles the row order reproducibly; the last
# `n_test` shuffled rows are held out for testing, the rest are for training.
n_test = 20

np.random.seed(0)
indices = np.random.permutation(len(diabetes_X))
diabetes_X_train = diabetes_X[indices[:-n_test]]
diabetes_X_test = diabetes_X[indices[-n_test:]]

# Split the targets into training/testing sets (same row order as the features)
diabetes_y_train = diabetes.target[indices[:-n_test]]
diabetes_y_test = diabetes.target[indices[-n_test:]]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# The coefficients.
# Format into ONE string before printing: `print('a', b)` is a print
# statement applied to a tuple on Python 2 (it shows the tuple repr with an
# escaped '\n'), whereas a single parenthesised argument prints the same
# text on both Python 2 and Python 3.
print('Coefficients: \n%s' % (regr.coef_,))

('Coefficients: \n', array([  -3.78606011, -235.47377739,  533.3485139 ,  321.69200267,
       -827.89481203,  521.12370087,  115.57077061,  159.58767508,
        742.51130661,   55.80177895]))


In [15]:
# Report the fitted model's `score` on the training rows, then on the
# held-out rows. For scikit-learn regressors `score` is the coefficient of
# determination R^2 (1.0 is a perfect fit).
# Parenthesised single-argument prints run on both Python 2 and Python 3.
print(regr.score(diabetes_X_train, diabetes_y_train))
print(regr.score(diabetes_X_test, diabetes_y_test))

0.508586627784
0.637757069018


In [21]:
# Manually scan the ridge penalty: refit one Ridge model per alpha on a
# log-spaced grid and report how each fit does on the held-out rows.
alphas = np.logspace(-4, 1, 6)  # six values from 1e-4 to 1e1, one per decade
for a in alphas:
    ridge_model = linear_model.Ridge(alpha=a)
    ridge_model.fit(diabetes_X_train, diabetes_y_train)

    # Held-out fit quality: the estimator's `score`, and the mean squared
    # prediction error computed by hand.
    r2 = ridge_model.score(diabetes_X_test, diabetes_y_test)
    mse = np.mean((ridge_model.predict(diabetes_X_test) - diabetes_y_test) ** 2)

    # A single pre-formatted string prints the same line on Python 2 and
    # Python 3; the original relied on Python 2's trailing-comma print
    # statements to stay on one line.
    print('alpha = %f score = %f MSE = %f' % (a, r2, mse))
    

alpha = 0.000100 score = 0.637831 MSE = 3170.878546
alpha = 0.001000 score = 0.638363 MSE = 3166.221066
alpha = 0.010000 score = 0.639188 MSE = 3158.993960
alpha = 0.100000 score = 0.628907 MSE = 3249.009010
alpha = 1.000000 score = 0.527802 MSE = 4134.204556
alpha = 10.000000 score = 0.207023 MSE = 6942.701166


In [23]:
# Run the same penalty scan with a lasso: one estimator is re-configured and
# refit for each value in `alphas`, and its held-out score is collected in
# the same order as `alphas`.
lasso_model = linear_model.Lasso()

scores = []
for a in alphas:
    lasso_model.set_params(alpha=a)
    lasso_model.fit(diabetes_X_train, diabetes_y_train)
    scores.append(lasso_model.score(diabetes_X_test, diabetes_y_test))



In [39]:
# Find the best performing lasso parameter: the alpha whose fit achieved the
# highest held-out score.
best_score = max(scores)
print(best_score)  # single-argument print works on Python 2 and Python 3

# `scores` is parallel to `alphas`; list.index returns the first position of
# the maximum, so a tie resolves to the earliest alpha in the grid.
best_alpha = alphas[scores.index(best_score)]
# NOTE: a regressor's `score` is R^2 (coefficient of determination), not a
# classification accuracy; the best held-out R^2 here is about 0.638.

0.638494504826


In [27]:
# Refit the lasso at the selected penalty and print its coefficients.
# `set_params` is used (rather than assigning the attribute directly) to match
# how the penalty is configured in the alpha scan cell.
lasso_model.set_params(alpha=best_alpha)
lasso_model.fit(diabetes_X_train, diabetes_y_train)
print(lasso_model.coef_)  # single-argument print works on Python 2 and 3

[  -0.         -222.95341234  538.68565764  315.12769556 -332.61745303
  122.65109957  -91.48915517  106.88826037  558.8842569    53.90992404]
