# Linear Regression Example

This example uses only a single feature of the diabetes dataset (the column at index 2), in order to illustrate a two-dimensional plot of this regression technique. The straight line can be seen in the plot, showing how linear regression attempts to draw a straight line that will best minimize the residual sum of squares between the observed responses in the dataset, and the responses predicted by the linear approximation.


The coefficients, the mean squared error and the variance score (R²) are also calculated.

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model

In [3]:
# Load the diabetes dataset bundled with scikit-learn
diabetes = datasets.load_diabetes()

# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(diabetes)

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ..., 
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]]), 'target': array([ 151.,   75.,  141.,  206.,  135.,   97.,  138.,   63.,  110.,
        310.,  101.,   69.,  179.,  185.,  118.,  171.,  166.,  144.,
         97.,  168.,   68.,   49.,   68.,  245.,  184.,  202.,  137.,
         85.,  131.,  283.,  129.,   59.,  341.,   87.,   65.,  102.,
        265.,  276.,  252.,   90.,  100.,   55.,   61.,   92.,  259.,
         53.,  1

In [4]:
# Show which fields the loaded dataset exposes.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(diabetes.keys())

['data', 'target']


In [5]:
# Dimensions of the feature matrix as (rows, columns).
# print(...) with one argument is valid on both Python 2 and Python 3.
print(diabetes.data.shape)

(442L, 10L)


In [6]:
# Wrap the raw feature matrix in a DataFrame for a tabular view
# (no column names are supplied, so pandas assigns integer labels).
diabetes_df = pd.DataFrame(diabetes.data)

# print(...) with one argument is valid on both Python 2 and Python 3.
print(diabetes_df)

            0         1         2         3         4         5         6  \
0    0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1   -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2    0.085299  0.050680  0.044451 -0.005671 -0.045599 -0.034194 -0.032356   
3   -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4    0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   
5   -0.092695 -0.044642 -0.040696 -0.019442 -0.068991 -0.079288  0.041277   
6   -0.045472  0.050680 -0.047163 -0.015999 -0.040096 -0.024800  0.000779   
7    0.063504  0.050680 -0.001895  0.066630  0.090620  0.108914  0.022869   
8    0.041708  0.050680  0.061696 -0.040099 -0.013953  0.006202 -0.028674   
9   -0.070900 -0.044642  0.039062 -0.033214 -0.012577 -0.034508 -0.024993   
10  -0.096328 -0.044642 -0.083808  0.008101 -0.103389 -0.090561 -0.013948   
11   0.027178  0.050680  0.017506 -0.033214 -0.007073  0.045972 -0.065491   

In [7]:
# Use only one feature: the column at index 2.
# The width-one slice keeps the result two-dimensional (one column per row).
diabetes_X = diabetes.data[:, 2:3]

In [8]:
# Small slicing demo used to explain the train/test split below.
a = list(range(1, 10))

# a[:-2] would give everything except the last two elements;
# a[-2:] gives only the last two elements.
a[-2:]


[8, 9]

In [9]:
# Split the data into training/testing sets:
# every row except the last 20 is used for training,
# and the last 20 rows are held out for testing.
diabetes_X_train, diabetes_X_test = np.split(diabetes_X, [-20])

In [10]:
# Split the targets into training/testing sets using the same cut point
# as the features: all but the last 20 for training, the last 20 for testing.
diabetes_y_train, diabetes_y_test = np.split(diabetes.target, [-20])

In [11]:
# Create the linear regression object and train it on the training sets
# in one step (fit returns the fitted estimator itself).
regr = linear_model.LinearRegression().fit(diabetes_X_train, diabetes_y_train)

# Leave the estimator as the cell's last expression so its repr is displayed.
regr



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [None]:
# Reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
#
# Every print below uses the single-argument call form, which behaves the same
# on Python 2 and Python 3 (the bare print statement is a SyntaxError on 3).

# Evaluate once on the held-out test split and reuse the results below.
test_predictions = regr.predict(diabetes_X_test)
test_r2 = regr.score(diabetes_X_test, diabetes_y_test)

# The coefficients
print('Coefficients: {}'.format(regr.coef_))

# The mean squared error (MSE): the average of the squared differences
# between the predicted responses and the observed responses.
print("Mean squared error: %.2f" % np.mean((test_predictions - diabetes_y_test) ** 2))

# LinearRegression.score returns the coefficient of determination R^2
# (1 is perfect prediction); the label text is kept as in the original output.
print('Variance score: %.2f' % test_r2)

print('No of Coefficient: {}'.format(len(regr.coef_)))

print('Estimated intercept coefficent: {}'.format(regr.intercept_))

# R-squared is the fraction of the variance of the response variable that is
# explained by the model. Its best possible value is 1.0; on held-out data it
# can be negative when the model does worse than always predicting the mean.
print('R^2 = {}'.format(test_r2))


Coefficients: [ 938.23786125]
Mean squared error: 2548.07
Variance score: 0.47
No of Coefficient: 1
Estimated intercept coefficent: 152.918861826
R^2 = 0.472575447982


In [None]:
# Plot outputs on an explicitly created figure/axes pair
fig, ax = plt.subplots()

# Observed test responses as black points
ax.scatter(diabetes_X_test, diabetes_y_test, color='black')

# Model predictions for the same test inputs, drawn as a blue line
ax.plot(diabetes_X_test, regr.predict(diabetes_X_test),
        color='blue', linewidth=3)

# Hide the tick marks on both axes
ax.set_xticks(())
ax.set_yticks(())

plt.show()