In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [2]:
# Load the raw dataset bundle and show its bundled description text.
bean = datasets.load_boston()
# Call form of print: identical output on Python 2 for a single argument,
# and valid syntax on Python 3 (the statement form is a SyntaxError there).
print(bean.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [4]:
def load_boston(test_size=None, random_state=None):
    """Load the Boston housing data and return a standardized train/test split.

    The data is split first and the scaler is fitted on the training rows
    only; the held-out rows are then transformed with those training
    statistics. Fitting the scaler on the full data set before splitting
    lets the test rows influence the mean/variance used to scale the
    training features, which leaks information into the evaluation.

    Parameters
    ----------
    test_size : float, int or None, optional
        Forwarded unchanged to ``train_test_split``. ``None`` (the default)
        keeps that function's own default split size.
    random_state : int or None, optional
        Forwarded unchanged to ``train_test_split``. Pass an integer to get
        the same split on every run; ``None`` (the default) keeps the
        previous unseeded behaviour.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` -- standardized feature
        arrays followed by the matching target arrays.
    """
    boston = datasets.load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data,
        boston.target,
        test_size=test_size,
        random_state=random_state,
    )

    # Learn the scaling parameters from the training rows only, then reuse
    # them for the test rows so no test-set statistics reach the model.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Keep returning a list, as train_test_split itself does.
    return [X_train, X_test, y_train, y_test]

In [9]:
# Unpack the four arrays produced by the notebook's own load_boston() helper.
(X_train, X_test, y_train, y_test) = load_boston()

In [10]:
# Sanity check: show the shape of the training feature array.
X_train.shape

(379L, 13L)

In [11]:
# Ordinary least-squares model with default settings, fitted on the
# training split. The fit call is left as the last expression so the
# fitted estimator is displayed as the cell output.
clf = LinearRegression()
clf.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [12]:
# Pair each true target with the linear model's prediction for the same row.
# list(...) materializes the pairs: on Python 3 a bare zip() is a lazy
# iterator and would only display its repr; on Python 2 this is unchanged.
list(zip(y_test, clf.predict(X_test)))

[(50.0, 29.888923920207262),
 (10.4, 6.2464110863974689),
 (21.5, 20.998819425067648),
 (43.5, 39.65687711959184),
 (22.5, 28.77653330951626),
 (15.0, 20.775595200822806),
 (19.199999999999999, 23.228324047903424),
 (18.899999999999999, 21.984311458606705),
 (14.9, 15.425330083513803),
 (25.0, 28.198785434301637),
 (7.5, 14.408219420481796),
 (22.199999999999999, 20.054143950529529),
 (13.4, 14.194713393642271),
 (21.199999999999999, 21.315158287144289),
 (18.800000000000001, 20.823202332304575),
 (26.399999999999999, 29.149631579252812),
 (26.699999999999999, 32.337385979066987),
 (19.5, 17.429643196503743),
 (13.300000000000001, 21.295851978255008),
 (37.299999999999997, 33.999821833048479),
 (37.600000000000001, 38.431474064160682),
 (17.399999999999999, 17.683215376317179),
 (22.0, 27.742232453687123),
 (12.5, 19.788024035255916),
 (10.199999999999999, 6.4181230691686864),
 (10.800000000000001, 11.647491214180217),
 (8.4000000000000004, 15.52216717495231),
 (20.300000000000001, 20.

In [14]:
# Mean squared error of the linear model on the held-out split.
mean_squared_error(y_true=y_test, y_pred=clf.predict(X_test))

30.92870725918392

In [15]:
# R^2 score of the linear model on the held-out split.
r2_score(y_true=y_test, y_pred=clf.predict(X_test))

0.64969372399382808

In [20]:
# Lasso is imported here because the notebook's first cell does not import it.
from sklearn.linear_model import Lasso

# Fit a Lasso model (alpha=0.1) on the same training split; the fit call is
# the last expression so the fitted estimator is shown as the cell output.
l = Lasso(alpha=0.1)
l.fit(X=X_train, y=y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [23]:
# Pair each true target with the Lasso model's prediction for the same row.
# list(...) materializes the pairs: on Python 3 a bare zip() is a lazy
# iterator and would only display its repr; on Python 2 this is unchanged.
list(zip(y_test, l.predict(X_test)))


[(50.0, 28.95399726962782),
 (10.4, 6.3429258371902613),
 (21.5, 21.968593415587314),
 (43.5, 38.860816658638029),
 (22.5, 28.693204076018731),
 (15.0, 21.588999652851228),
 (19.199999999999999, 23.638601273754855),
 (18.899999999999999, 22.606647238614919),
 (14.9, 15.85396597074415),
 (25.0, 27.025316268934006),
 (7.5, 14.499487363375859),
 (22.199999999999999, 20.265585900547112),
 (13.4, 14.522735074611958),
 (21.199999999999999, 21.404700919618747),
 (18.800000000000001, 21.145898186735227),
 (26.399999999999999, 28.405597318956467),
 (26.699999999999999, 31.502818041738088),
 (19.5, 17.569255015081946),
 (13.300000000000001, 20.863775432709883),
 (37.299999999999997, 32.506929905235729),
 (37.600000000000001, 38.10650892759169),
 (17.399999999999999, 18.212548840746713),
 (22.0, 28.004404438764865),
 (12.5, 19.346407833273076),
 (10.199999999999999, 6.0078852897504937),
 (10.800000000000001, 11.69977292090185),
 (8.4000000000000004, 15.651210242507931),
 (20.300000000000001, 20.4

In [28]:
# Mean squared error of the Lasso model on the held-out split.
mean_squared_error(y_true=y_test, y_pred=l.predict(X_test))

32.611495269637821

In [25]:
# R^2 score of the Lasso model on the held-out split.
r2_score(y_true=y_test, y_pred=l.predict(X_test))

0.63063404599597406