###BOSTON HOUSING 



In [27]:
from sklearn import datasets
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
import numpy as np

In [3]:
bean = datasets.load_boston()
print bean.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [4]:
def load_boston():
    scaler = StandardScaler()
    boston = datasets.load_boston()
    X=boston.data
    y=boston.target
    X = scaler.fit_transform(X)
    return train_test_split(X,y)

In [5]:
X_train, X_test, y_train, y_test = load_boston()

In [6]:
X_train.shape

(379L, 13L)

###Fitting a Linear Regression


It's as easy as instantiating a new regression object (line 1) and giving your regression object your training data (line 2) by calling .fit(independent variables, dependent variable)

In [9]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)


###Making a Prediction


X_test is our holdout set of data. We know the answer (y_test) but the computer does not.

Using the command below, create a tuple for each observation, where combining the real value (y_test) with the value our regressor predicts (clf.predict(X_test))



In [14]:
zip (y_test, clf.predict(X_test))

[(23.100000000000001, 7.7658622470698973),
 (23.100000000000001, 17.059508814545133),
 (13.800000000000001, 15.688187550998407),
 (12.1, 18.683664107507575),
 (21.0, 22.63476466772152),
 (17.800000000000001, 17.52108632648871),
 (23.699999999999999, 27.34509151416362),
 (31.100000000000001, 32.399697916047614),
 (8.5, 16.265873553397292),
 (37.899999999999999, 33.451412345205391),
 (20.300000000000001, 23.068434551918411),
 (33.299999999999997, 36.839979037263127),
 (24.0, 24.81377306216136),
 (32.0, 33.450986388694815),
 (25.100000000000001, 31.604006028884179),
 (36.200000000000003, 26.963500147004712),
 (21.199999999999999, 22.696235104041296),
 (37.299999999999997, 34.197102316504001),
 (18.5, 18.838556131542234),
 (20.100000000000001, 20.131534573877623),
 (32.399999999999999, 37.367041059629386),
 (15.6, 15.313642598579509),
 (18.800000000000001, 20.465958477969764),
 (14.6, 19.255452317263369),
 (21.0, 21.008891070072629),
 (29.399999999999999, 30.690799845134062),
 (29.89999999

### MEAN SQUARED ERROR

Measuring the performance using MSE

In [17]:
mean_squared_error(y_test, clf.predict(X_test))

29.008572533724553

### R2

Measuring the performance sing r2

In [18]:
r2_score(y_test, clf.predict(X_test))

0.61503430600614295

###L2 Regularization

In [22]:
np.random.seed(0)
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train) 
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, solver='auto', tol=0.009)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.009)

In [28]:
r2_score(y_test,clf.predict(X_test))

0.61503430600614295

In [29]:
mean_squared_error(y_test, clf.predict(X_test))

29.008572533724553