In [2]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [3]:
# Fetch the bundled Boston housing dataset object and print the
# human-readable description that ships with it.
bean = datasets.load_boston()
print(bean.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [4]:
def load_boston(random_state=None):
    """Load the Boston housing data as a standardized train/test split.

    The data is split first, and the StandardScaler is fitted on the
    training rows only; the test rows are then transformed with the
    training mean and standard deviation. Fitting the scaler on the full
    dataset before splitting would leak test-set statistics into the
    training features and make the test scores optimistic.

    Parameters
    ----------
    random_state : int, RandomState instance or None, optional
        Forwarded to train_test_split. The default (None) leaves the
        split random on every call; pass an int to get a reproducible
        split.

    Returns
    -------
    list
        [X_train, X_test, y_train, y_test], in the same order that
        train_test_split produces them, with both feature matrices
        standardized using the training-set statistics.
    """
    boston = datasets.load_boston()

    # Split the raw data before any preprocessing so that nothing computed
    # from the test rows can influence the training features.
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=random_state)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)  # learn mean/std from train only
    X_test = scaler.transform(X_test)        # reuse the training statistics
    return [X_train, X_test, y_train, y_test]

In [5]:
# Unpack the train/test feature matrices and target vectors returned by the
# load_boston() helper defined in this notebook.
X_train, X_test, y_train, y_test = load_boston()

In [6]:
# Display the (rows, columns) dimensions of the training feature matrix.
X_train.shape

(379L, 13L)

In [7]:
# Linear regression estimator created with scikit-learn's default settings.
linReg = LinearRegression()

In [8]:
# Fit the linear model on the training split; the fitted estimator is the
# value of this expression, so its repr is shown as the cell output.
linReg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
# Pair every true test target with the linear model's prediction for the same
# row so the two can be compared side by side.
zip(y_test, linReg.predict(X_test))

[(21.100000000000001, 22.33833524429906),
 (7.2000000000000002, 10.099547787992883),
 (16.5, 21.91599118715251),
 (16.600000000000001, 15.27388484379345),
 (23.100000000000001, 23.58830205934801),
 (28.100000000000001, 25.111013549511004),
 (20.300000000000001, 18.965651330712621),
 (19.899999999999999, 18.432912579914888),
 (27.899999999999999, 32.053011283918082),
 (21.800000000000001, 21.148843430334878),
 (14.300000000000001, 14.191178550946637),
 (17.5, 15.887520275269535),
 (21.199999999999999, 21.053133828340563),
 (18.699999999999999, 21.21416478010314),
 (10.9, 14.046886907634505),
 (24.5, 20.673325822836617),
 (11.300000000000001, 14.069235481322206),
 (13.6, 14.144855748714233),
 (18.199999999999999, 18.438359712751371),
 (7.0, 8.7977905907252811),
 (22.800000000000001, 26.866574023509262),
 (50.0, 25.574901718250437),
 (33.399999999999999, 35.83896921464553),
 (16.800000000000001, 20.878227129589504),
 (23.199999999999999, 26.812782912328281),
 (24.399999999999999, 23.69342

In [10]:
#R2 score calculation

In [11]:
# R^2 (coefficient of determination) of the linear model's predictions on the
# test split.
r2_score(y_test, linReg.predict(X_test))

0.69692313852497634

In [12]:
#MSE calculation

In [13]:
# sqrt is needed by the RMSE calculations in the cells that follow.
from math import sqrt
# Mean squared error of the linear model's predictions on the test split.
mse = mean_squared_error(y_test, linReg.predict(X_test))
print "MSE: ", mse

MSE:  24.7352690101


In [14]:
# Root mean squared error: the square root of the current value of mse, which
# puts the error back in the same units as the target.
rmse = sqrt(mse)
print "RMSE: ", rmse

RMSE:  4.97345644498


# Ridge model

In [16]:
from sklearn.linear_model import Ridge
# Ridge regression (L2-regularized linear regression) with regularization
# strength alpha=1.0, fitted on the same training split as linReg.
ridge = Ridge(alpha=1.0)
# The fitted estimator is the value of this expression, so its repr is shown
# as the cell output.
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [18]:
# Score the alpha=1.0 ridge model on the test split with the same three
# metrics used for the plain linear model (R^2, MSE, RMSE).
print "R2: ", r2_score(y_test, ridge.predict(X_test))
# Note: this rebinds mse, replacing the value computed for linReg.
mse = mean_squared_error(y_test, ridge.predict(X_test))
print "MSE: ", mse
print "RMSE: ", sqrt(mse)

 R2:  0.697537517884
MSE:  24.6851271463
RMSE:  4.96841294039


# Optimization of Ridge model

In [19]:
# Refit ridge regression with a stronger penalty (alpha=10) and report R^2,
# MSE and RMSE on the test split.
# NOTE(review): alpha values are being compared using test-set scores, so the
# chosen alpha's test metrics are likely optimistic; consider selecting alpha
# by cross-validation on the training split instead - confirm intent.
optRidge = Ridge(alpha=10)
optRidge.fit(X_train, y_train)
print "R2: ", r2_score(y_test, optRidge.predict(X_test))
# Rebinds mse to the error of this model.
mse = mean_squared_error(y_test, optRidge.predict(X_test))
print "MSE: ", mse
print "RMSE: ", sqrt(mse)

R2:  0.701134089103
MSE:  24.3915971283
RMSE:  4.93878498502


In [20]:
# Refit ridge regression with alpha=7.5 and report R^2, MSE and RMSE on the
# test split. optRidge and mse are rebound here, so the values from any
# earlier fit are no longer reachable under these names.
optRidge = Ridge(alpha=7.5)
optRidge.fit(X_train, y_train)
print "R2: ", r2_score(y_test, optRidge.predict(X_test))
mse = mean_squared_error(y_test, optRidge.predict(X_test))
print "MSE: ", mse
print "RMSE: ", sqrt(mse)

R2:  0.700397472629
MSE:  24.4517152334
RMSE:  4.9448675648


# Finally

In [22]:
# Plain linear regression and ridge regression (alpha = 1.0, 7.5 and 10) perform almost identically on this dataset: in every case the test R2 is about 0.70 and the test RMSE is between 4.9 and 5.0.