#### From HOML Chapter 2 – End-to-end Machine Learning project

*predict median house values in Californian districts, given a number of features from these districts.*


Load and train the traing data

In [25]:
import numpy as np
from pprint import pprint

In [26]:
housing = np.genfromtxt('datasets/housing/housing_train_array.csv', delimiter=',')
housing_target = np.genfromtxt('datasets/housing/housing_train_target_array.csv', delimiter=',')

In [27]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing, housing_target)

housing_predictions = lin_reg.predict(housing)
lin_mse = mean_squared_error(housing_target, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

69050.98178244587

In [28]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
tree_reg = DecisionTreeRegressor(random_state=42)

In [29]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [30]:
scores = cross_val_score(tree_reg, housing, housing_target,
                         scoring="neg_mean_squared_error", cv=10)
pprint(scores)
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)

array([-4.48190321e+09, -4.50573195e+09, -5.05052157e+09, -4.80388628e+09,
       -4.75517703e+09, -5.71870935e+09, -4.46385263e+09, -4.90116341e+09,
       -4.81913280e+09, -4.79081291e+09])
Scores: [66947.01789833 67124.74914113 71067.02170485 69310.0734372
 68957.79164907 75622.14855177 66812.0694578  70008.30958407
 69419.97404584 69215.69846262]
Mean: 69448.48539327028
Standard deviation: 2448.8752390813447


In [31]:
scores = cross_val_score(lin_reg, housing, housing_target,
                         scoring="neg_mean_squared_error", cv=10)
pprint(scores)
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)

array([-4.54955924e+09, -4.53326193e+09, -4.67334235e+09, -5.57111294e+09,
       -4.66688012e+09, -5.13065836e+09, -4.27207885e+09, -4.70206808e+09,
       -5.25279671e+09, -4.63734690e+09])
Scores: [67450.42057782 67329.50264436 68361.84864912 74639.88837894
 68314.56738182 71628.61410355 65361.14176205 68571.62738037
 72476.18028894 68098.06828865]
Mean: 69223.18594556303
Standard deviation: 2657.2683112776926


In [32]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
scores = cross_val_score(forest_reg, housing, housing_target,
                         scoring="neg_mean_squared_error", cv=10)
pprint(scores)
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)

array([-2.53740638e+09, -2.41321837e+09, -2.73279971e+09, -2.82438628e+09,
       -2.73988141e+09, -3.18550876e+09, -2.59496291e+09, -2.86669855e+09,
       -2.86382169e+09, -2.70172232e+09])
Scores: [50372.67497764 49124.51899831 52276.18684545 53144.95538783
 52343.87653726 56440.31145704 50940.7784416  53541.5591025
 53514.68671985 51978.09457949]
Mean: 52367.764304695695
Standard deviation: 1912.5637580637838


In [33]:
from sklearn import tree
dt = tree.DecisionTreeClassifier()
scores = cross_val_score(dt, housing, housing_target,
                         scoring="neg_mean_squared_error", cv=10)
pprint(scores)
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)



array([-7.93499878e+09, -6.55702812e+09, -6.48875821e+09, -6.39319795e+09,
       -5.63725437e+09, -5.67863938e+09, -5.91823947e+09, -7.59400257e+09,
       -7.99042598e+09, -7.47138275e+09])
Scores: [89078.61013341 80975.47853923 80552.82870733 79957.47590066
 75081.65135528 75356.74738498 76930.09472549 87143.57443433
 89389.18265453 86437.16070481]
Mean: 82090.28045400501
Standard deviation: 5251.53440092081


In [38]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)

In [None]:
scores = cross_val_score(clf, housing, housing_target,
                         scoring="neg_mean_squared_error", cv=10)
pprint(scores)
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)

