In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [2]:
bean = datasets.load_boston()
print bean.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [3]:
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [4]:
X_train, X_test, y_train, y_test = load_boston()

In [5]:
X_train.shape

(379L, 13L)

linear regression

In [7]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
zip (y_test, clf.predict(X_test))

[(18.5, 19.026883182538114),
 (24.300000000000001, 24.013517862017387),
 (48.799999999999997, 41.306153114472281),
 (21.199999999999999, 21.054388784864123),
 (22.5, 22.713672687126817),
 (33.200000000000003, 35.464621958837554),
 (8.5, 9.4802542748025971),
 (23.100000000000001, 22.183560337415162),
 (22.199999999999999, 24.242300747091626),
 (18.199999999999999, 18.929446270526892),
 (21.699999999999999, 22.171071576955701),
 (21.199999999999999, 23.214968821889034),
 (23.300000000000001, 24.817028720339518),
 (11.5, 13.730000147449005),
 (50.0, 19.537604174529886),
 (36.200000000000003, 27.508395455659333),
 (8.3000000000000007, 13.014943461550752),
 (26.600000000000001, 23.010441206148954),
 (21.399999999999999, 20.019632004337417),
 (5.5999999999999996, 13.444835061895725),
 (33.799999999999997, 34.962854270388114),
 (17.899999999999999, 1.4831024558657191),
 (17.600000000000001, 15.800400394123439),
 (18.699999999999999, 17.996261726146475),
 (24.300000000000001, 20.39969544847364

In [9]:
y_pred=clf.predict(X_test)
mean_squared_error(y_test, y_pred)

33.223107996098733

In [10]:
y_pred

array([ 19.02688318,  24.01351786,  41.30615311,  21.05438878,
        22.71367269,  35.46462196,   9.48025427,  22.18356034,
        24.24230075,  18.92944627,  22.17107158,  23.21496882,
        24.81702872,  13.73000015,  19.53760417,  27.50839546,
        13.01494346,  23.01044121,  20.019632  ,  13.44483506,
        34.96285427,   1.48310246,  15.80040039,  17.99626173,
        20.39969545,  12.68806677,  20.07826111,  15.92812448,
        24.58248141,  43.2335485 ,  13.74159771,  16.72613967,
        15.20744301,  17.10505781,  21.44150526,  19.59101338,
        10.18447438,  24.90412369,  30.82010327,  13.85808974,
        22.80517508,  35.48012707,  23.5691943 ,  16.08424423,
        19.70395982,  16.94502509,  24.45687184,  13.60624964,
        10.31439627,  24.46196151,  32.63084925,  31.95410008,
        35.4176508 ,  28.46302703,  15.96720107,  14.32951989,
        19.48977688,  32.74051594,  17.13167478,  25.96160871,
        19.43621626,  26.37989787,  25.03988127,  19.98

R2 score

In [11]:
r2_score(y_test, y_pred)

0.6655406314121779

implementing sklearn.linear_model.Ridge, adjusting the Linear parameter, source: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

In [27]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV

In [28]:
clearFigRidge = Ridge(alpha=1.5)
clearFigRidge.fit(X_train, y_train)

Ridge(alpha=1.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [29]:
zip (y_test, clf.predict(X_test))

[(18.5, 19.026883182538114),
 (24.300000000000001, 24.013517862017387),
 (48.799999999999997, 41.306153114472281),
 (21.199999999999999, 21.054388784864123),
 (22.5, 22.713672687126817),
 (33.200000000000003, 35.464621958837554),
 (8.5, 9.4802542748025971),
 (23.100000000000001, 22.183560337415162),
 (22.199999999999999, 24.242300747091626),
 (18.199999999999999, 18.929446270526892),
 (21.699999999999999, 22.171071576955701),
 (21.199999999999999, 23.214968821889034),
 (23.300000000000001, 24.817028720339518),
 (11.5, 13.730000147449005),
 (50.0, 19.537604174529886),
 (36.200000000000003, 27.508395455659333),
 (8.3000000000000007, 13.014943461550752),
 (26.600000000000001, 23.010441206148954),
 (21.399999999999999, 20.019632004337417),
 (5.5999999999999996, 13.444835061895725),
 (33.799999999999997, 34.962854270388114),
 (17.899999999999999, 1.4831024558657191),
 (17.600000000000001, 15.800400394123439),
 (18.699999999999999, 17.996261726146475),
 (24.300000000000001, 20.39969544847364

 R2 and MSE after applying Ridge

In [30]:
clf2 = LinearRegression()
clf2.fit(X_test, y_test)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [31]:
y_pred2=clf2.predict(X_test)
mean_squared_error(y_test, y_pred)

33.223107996098733

In [32]:
y_pred2

array([ 20.78184851,  25.82085878,  40.21373708,  22.17103992,
        20.90155752,  35.50676927,   4.24368968,  25.23236423,
        24.08387107,  20.33351563,  22.66428304,  22.81481559,
        28.48126494,  14.6655586 ,  34.29382767,  28.48698771,
        13.64721584,  19.82368112,  20.30372605,   8.35287869,
        34.53899533,   3.96347959,  18.04569725,  17.42319245,
        19.08537693,  19.16070296,  20.04622287,  15.95305939,
        28.37618599,  42.35260512,   8.88857824,  22.3383049 ,
        19.19822979,  12.88752353,  24.87314327,  18.74276207,
         4.95042466,  21.79417695,  29.4198353 ,  12.44156037,
        29.97289001,  36.94252304,  19.64591535,  20.25989443,
        20.24112036,  19.58800917,  25.15555977,   9.42860293,
        16.17032856,  23.38567256,  34.02237422,  40.3024889 ,
        37.35336245,  31.29128726,  14.96862264,  20.37086991,
        19.58717493,  32.7500989 ,  16.9057139 ,  26.06678181,
        21.24452866,  28.54717933,  24.19725258,  17.10

In [36]:
clearFigRidgeCV = RidgeCV(alphas=[-.2, .1,], store_cv_values = True)
clearFigRidgeCV.fit(X_train, y_train)

RidgeCV(alphas=[-0.2, 0.1], cv=None, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=True)

In [37]:
zip (y_test, clf.predict(X_test))

[(18.5, 19.026883182538114),
 (24.300000000000001, 24.013517862017387),
 (48.799999999999997, 41.306153114472281),
 (21.199999999999999, 21.054388784864123),
 (22.5, 22.713672687126817),
 (33.200000000000003, 35.464621958837554),
 (8.5, 9.4802542748025971),
 (23.100000000000001, 22.183560337415162),
 (22.199999999999999, 24.242300747091626),
 (18.199999999999999, 18.929446270526892),
 (21.699999999999999, 22.171071576955701),
 (21.199999999999999, 23.214968821889034),
 (23.300000000000001, 24.817028720339518),
 (11.5, 13.730000147449005),
 (50.0, 19.537604174529886),
 (36.200000000000003, 27.508395455659333),
 (8.3000000000000007, 13.014943461550752),
 (26.600000000000001, 23.010441206148954),
 (21.399999999999999, 20.019632004337417),
 (5.5999999999999996, 13.444835061895725),
 (33.799999999999997, 34.962854270388114),
 (17.899999999999999, 1.4831024558657191),
 (17.600000000000001, 15.800400394123439),
 (18.699999999999999, 17.996261726146475),
 (24.300000000000001, 20.39969544847364

In [39]:
clearFigRidge0 = Ridge(alpha=2.0)
clearFigRidge0.fit(X_train, y_train)

Ridge(alpha=2.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [42]:
print 'Ridge r2:-', r2_score(y_test, clearFigRidge0.predict(X_test))

Ridge r2:- 0.664270037488


In [43]:
print 'Ridge MSE:-', mean_squared_error(y_test, clearFigRidge0.predict(X_test))

Ridge MSE:- 33.3493208731
