# Load data and probe data stats

In [2]:
from sklearn.datasets import load_boston
import numpy as np
boston = load_boston()
X = boston.data
y = boston.target

In [6]:
import matplotlib.pyplot as plt
import numpy as np

number_of_houses = X.shape[0]
number_of_features = X.shape[1]
max_price = np.max(y)
min_price = np.min(y)
mean_price = np.mean(y)
median_price = np.median(y)
standard_deviation = np.std(y)
print "number of houses:",number_of_houses
print "number of features:",number_of_features
print "max price of house:",max_price
print "min price of house:",min_price
print "mean price of house:",mean_price
print "median price of house:",median_price
print "standard deviation for prices of house:",standard_deviation

number of houses: 506
number of features: 13
max price of house: 50.0
min price of house: 5.0
mean price of house: 22.5328063241
median price of house: 21.2
standard deviation for prices of house: 9.18801154528


In [29]:
print("features mean %s" % str(np.mean(X, axis = 0)))
print("features variance %s" % str(np.std(X, axis = 0)))

features mean [  3.59376071e+00   1.13636364e+01   1.11367787e+01   6.91699605e-02
   5.54695059e-01   6.28463439e+00   6.85749012e+01   3.79504269e+00
   9.54940711e+00   4.08237154e+02   1.84555336e+01   3.56674032e+02
   1.26530632e+01]
features variance [  8.58828355e+00   2.32993957e+01   6.85357058e+00   2.53742935e-01
   1.15763115e-01   7.01922514e-01   2.81210326e+01   2.10362836e+00
   8.69865112e+00   1.68370495e+02   2.16280519e+00   9.12046075e+01
   7.13400164e+00]


# Load boston data and split into train and test data

In [3]:
train_size = int(0.7 * y.size)
np.random.seed(0)
indices = np.random.permutation(y.size)

train_X = X[indices[:train_size]]
train_y = y[indices[:train_size]]
test_X = X[indices[train_size:]]
test_y = y[indices[train_size:]]

# Basic regression model
1. Linear regression
2. Ridge regression
3. Lasso regression

In [6]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [9]:
pred_y = np.full(test_y.shape, train_y.mean())
print("Baseline MSE: %.3f" % mean_squared_error(test_y, pred_y))

Baseline MSE: 99.330


In [10]:
reg = linear_model.LinearRegression()
pred_y = reg.fit(train_X, train_y).predict(test_X)
score = reg.score(test_X, test_y)
print("Linear regression, MSE: %.3f, score %.3f" % (mean_squared_error(test_y, pred_y), score))

Linear regression, MSE: 24.541, score 0.752


In [11]:
reg_cv = linear_model.RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0])
pred_y = reg_cv.fit(train_X, train_y).predict(test_X)
print("Ridge with alpha = %.3f, MSE: %.3f, score %.3f"
      % (reg_cv.alpha_, mean_squared_error(test_y, pred_y), reg_cv.score(test_X, test_y) ))

Ridge with alpha = 0.010, MSE: 24.546, score 0.752


In [12]:
lasso_cv = linear_model.LassoCV(alphas=[0.01, 0.1, 1.0, 10.0])
pred_y = lasso_cv.fit(train_X, train_y).predict(test_X)
print("Lasso with alpha = %.3f, MSE: %.3f, score: %.3f"
      % (reg_cv.alpha_, mean_squared_error(test_y, pred_y), lasso_cv.score(test_X, test_y) ))

Lasso with alpha = 0.010, MSE: 24.768, score: 0.750


In [13]:
br = linear_model.BayesianRidge()
pred_y = br.fit(train_X, train_y).predict(test_X)
print("BayesianRidge MSE: %.3f, score: %.3f"
      % (mean_squared_error(test_y, pred_y), br.score(test_X, test_y)))

BayesianRidge MSE: 26.217, score: 0.735


# Whiten features

In [18]:
from sklearn import preprocessing
X_scaled = preprocessing.scale(X)
train_X_scaled = X[indices[:train_size]]
test_X_scaled = X[indices[train_size:]]

In [27]:
print("Scaled features mean " + str(np.mean(X_scaled, axis=0)))
print("Scaled features variance " + str(np.std(X_scaled, axis=0)))

Scaled features mean [  6.34099712e-17  -6.34319123e-16  -2.68291099e-15   4.70199198e-16
   2.49032240e-15  -1.14523016e-14  -1.40785495e-15   9.21090169e-16
   5.44140929e-16  -8.86861950e-16  -9.20563581e-15   8.16310129e-15
  -3.37016317e-16]
Scaled features variance [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


In [18]:
reg = linear_model.LinearRegression()
pred_y = reg.fit(train_X_scaled, train_y).predict(test_X_scaled)
score = reg.score(test_X_scaled, test_y)
print("Linear regression, MSE: %.3f, score %.3f" % (mean_squared_error(test_y, pred_y), score))

Linear regression, MSE: 24.541, score 0.752


In [19]:
reg_cv = linear_model.RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0])
pred_y = reg_cv.fit(train_X_scaled, train_y).predict(test_X_scaled)
print("Ridge with alpha = %.3f, MSE: %.3f, score %.3f"
      % (reg_cv.alpha_, mean_squared_error(test_y, pred_y), reg_cv.score(test_X_scaled, test_y) ))

Ridge with alpha = 0.010, MSE: 24.546, score 0.752


In [20]:
lasso_cv = linear_model.LassoCV(alphas=[0.01, 0.1, 1.0, 10.0])
pred_y = lasso_cv.fit(train_X_scaled, train_y).predict(test_X_scaled)
print("Lasso with alpha = %.3f, MSE: %.3f, score: %.3f"
      % (reg_cv.alpha_, mean_squared_error(test_y, pred_y), lasso_cv.score(test_X_scaled, test_y) ))

Lasso with alpha = 0.010, MSE: 24.768, score: 0.750


In [21]:
br = linear_model.BayesianRidge()
pred_y = br.fit(train_X_scaled, train_y).predict(test_X_scaled)
print("BayesianRidge MSE: %.3f, score: %.3f"
      % (mean_squared_error(test_y, pred_y), br.score(test_X_scaled, test_y)))

BayesianRidge MSE: 26.217, score: 0.735


# Decision Tree and Ensemble Learning

In [13]:
# Basic decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(max_depth=None,random_state=0)
dt.fit(train_X, train_y)
pred_y = dt.predict(test_X)
print("Decision Tree with orig features MSE: %.2f" % mean_squared_error(test_y, pred_y))

Decision Tree MSE: 29.24


In [21]:
# Basic decision tree
from sklearn.tree import DecisionTreeRegressor
wdt = DecisionTreeRegressor(max_depth=None,random_state=0)
wdt.fit(train_X_scaled, train_y)
pred_y = dt.predict(test_X_scaled)
print("Decision Tree with whiten features MSE: %.2f" % mean_squared_error(test_y, pred_y))

Decision Tree with whiten features MSE: 29.24


In [47]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None),
                        n_estimators=3500,
                        random_state=np.random.RandomState(1))
ada.fit(train_X, train_y)
pred_y = ada.predict(test_X)
print("Ada boost decision tree with orig features MSE: %.2f" % mean_squared_error(test_y, pred_y))

Ada boost decision tree with orig features MSE: 11.12


In [48]:
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None),
                        n_estimators=500,
                        random_state=np.random.RandomState(1))
ada.fit(train_X_scaled, train_y)
pred_y = ada.predict(test_X_scaled)
print("Ada boost decision tree with whiten features MSE: %.2f" % mean_squared_error(test_y, pred_y))

Ada boost decision tree with whiten features MSE: 11.11


In [50]:
from sklearn.ensemble import RandomForestRegressor
orf = RandomForestRegressor(n_estimators=500, criterion='mse', max_depth=None)
orf.fit(train_X, train_y)
pred_y = orf.predict(test_X)
print("Random forest with orig features MSE: %.2f" % mean_squared_error(test_y, pred_y))
wrf = RandomForestRegressor(n_estimators=500, criterion='mse', max_depth=None)
wrf.fit(train_X, train_y)
pred_y = wrf.predict(test_X)
print("Random forest with whiten features MSE: %.2f" % mean_squared_error(test_y, pred_y))

Random forest with orig features MSE: 15.12
Random forest with whiten features MSE: 14.80


In [56]:
from sklearn.ensemble import GradientBoostingRegressor
ogbr = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1,
                                 max_depth=1, random_state=0, loss='ls')
ogbr.fit(train_X, train_y)
pred_y = ogbr.predict(test_X)
print("Gradient Boosting Regressor MSE: %.2f" % mean_squared_error(test_y, pred_y))

wgbr = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1,
                                 max_depth=1, random_state=0, loss='ls')
wgbr.fit(train_X_scaled, train_y)
pred_y = wgbr.predict(test_X_scaled)
print("Gradient Boosting Regressor MSE: %.2f" % mean_squared_error(test_y, pred_y))

Gradient Boosting Regressor MSE: 14.02
Gradient Boosting Regressor MSE: 14.02


# SVM

In [None]:
from sklearn.