# Linear regression, the usual way (using scikit-learn)

In [12]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
import numpy as np

### The diabetes dataset
Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure of disease progression one year after baseline.

Note: Each of these 10 feature variables has been mean-centered and scaled by the standard deviation times the square root of n_samples (i.e. the sum of squares of each column totals 1).

In [13]:
# Load the feature matrix X and the target vector y directly as arrays.
# `return_X_y` is passed by keyword: scikit-learn deprecated positional use
# of this argument in 0.23, and newer releases raise a TypeError for it.
X, y = load_diabetes(return_X_y=True)
X.shape, y.shape

((442, 10), (442,))

In [14]:
# Peek at the first three rows (all columns) of the feature matrix.
X[:3]

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632783, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567061, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286377, -0.02593034]])

In [15]:
# Peek at the first three target values.
y[0:3]

array([ 151.,   75.,  141.])

In [16]:
# Sequential hold-out split (no shuffling): the first `n_train` samples are
# used for fitting, everything after them is kept aside for evaluation.
n_train = 300

X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [17]:
# Fit a linear regression on the training split; `fit` returns the estimator
# itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# One learned weight per input feature.
print("Coefficients: \n", model.coef_)

Coefficients: 
 [ -16.57607993 -254.66532396  560.98630022  278.91811152 -393.41357305
   97.05460405  -19.0023093   169.46450327  632.95050374  114.21638941]


In [25]:
# Predictions of the fitted model on both splits.
train_preds, test_preds = model.predict(X_train), model.predict(X_test)

# Mean squared error: average of the squared residuals.
train_mse = np.square(train_preds - y_train).mean()
test_mse = np.square(test_preds - y_test).mean()

# Value returned by the estimator's own `score` method on each split.
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)

# Report both metrics rounded to two decimals, train first and test second.
print("Mean squared error (train/test): {} / {}".format(np.round(train_mse, 2), np.round(test_mse, 2)))
print("R square (train/test): {} / {}".format(np.round(train_r2, 2), np.round(test_r2, 2)))

Mean squared error (train/test): 2923.0 / 2794.57
R square (train/test): 0.51 / 0.51
