### Load housing data

In [1]:
import pandas as pd

boston_housing = pd.read_csv('data/boston_housing.data',sep=' ', header=None)

In [2]:
boston_housing[13] = boston_housing[13]*1000
boston_housing.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.10574,0.0,27.74,0,0.609,5.983,98.8,1.8681,4,711.0,20.1,390.11,18.07,13600.0
1,7.75223,0.0,18.1,0,0.713,6.301,83.7,2.7831,24,666.0,20.2,272.21,16.23,14900.0
2,0.02763,75.0,2.95,0,0.428,6.595,21.8,5.4011,3,252.0,18.3,395.63,4.32,30800.0
3,0.09266,34.0,6.09,0,0.433,6.495,18.4,5.4917,7,329.0,16.1,383.61,8.67,26400.0
4,15.1772,0.0,18.1,0,0.74,6.152,100.0,1.9142,24,666.0,20.2,9.32,26.45,8700.0


### Generate training and test data

In [3]:
from sklearn.model_selection import train_test_split

train_set, test_set= train_test_split(boston_housing, test_size=0.3, random_state=42)

In [4]:
x_train = train_set.drop(13, axis=1)
y_train = train_set[13].copy()

x_test = test_set.drop(13, axis=1)
y_test = test_set[13].copy()

x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
5,0.37578,0.0,10.59,1,0.489,5.404,88.6,3.665,4,277.0,18.6,395.24,23.98
116,4.26131,0.0,18.1,0,0.77,6.112,81.3,2.5091,24,666.0,20.2,390.74,12.67
45,0.52693,0.0,6.2,0,0.504,8.725,83.0,2.8944,8,307.0,17.4,382.0,4.63
16,4.89822,0.0,18.1,0,0.631,4.97,100.0,1.3325,24,666.0,20.2,375.52,3.26
468,0.03427,0.0,5.19,0,0.515,5.869,46.3,5.2311,5,224.0,20.2,396.9,9.8


### Prep training and test data

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


In [6]:
null_rows = x_train[x_train.isnull().any(axis=1)]
if null_rows.empty:
    print("No null rows found !")

No null rows found !


In [7]:
pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

X_train = pipeline.fit_transform(x_train)
X_test = pipeline.fit_transform(x_test)

In [8]:
print(x_train.shape)
print(X_train.shape)

print(x_test.shape)
print(X_test.shape)

(354, 13)
(354, 13)
(152, 13)
(152, 13)


### Training and Evaluation

In [9]:
def print_kfold_scores(model, model_name):
    
    temp_scores = cross_val_score(model, X_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)
    scores = np.sqrt(-temp_scores)
    print("name: ", model_name)
    print("rmse: ", scores)
    print("mean: ", scores.mean())
    print("sd: ", scores.std())
    
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    test_rmse = np.sqrt(mse)
    print("rmse_test: ", test_rmse)
    

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


In [11]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

print_kfold_scores(linear_reg, "LinearRegressor")
print("-------------------------------------------------------------------------------")


name:  LinearRegressor
rmse:  [5950.35993865 5377.69287816 4064.73345501 3410.70946942 4316.39174414
 3159.04766876 3238.7067375  4855.283929   5184.95887495 5771.99688189]
mean:  4532.988157747077
sd:  993.7361127648367
rmse_test:  5516.94179253281
-------------------------------------------------------------------------------


In [12]:
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)

print_kfold_scores(tree_reg, "DecisionTreeRegressor")
print("-------------------------------------------------------------------------------")


name:  DecisionTreeRegressor
rmse:  [4048.42218923 5556.62767433 4045.12738434 6892.68936386 4573.99481541
 3491.49988237 3168.41555716 4926.86512907 3453.32138259 6264.61719273]
mean:  4642.15805710785
sd:  1193.2080945734692
rmse_test:  4485.070262900894
-------------------------------------------------------------------------------


In [13]:
forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(X_train, y_train)

print_kfold_scores(forest_reg, "RandomForestRegressor")
print("-------------------------------------------------------------------------------")


name:  RandomForestRegressor
rmse:  [4090.84479186 4930.56341157 3997.03292734 2958.47041936 3794.68575774
 3218.06419185 2510.9167364  2765.7615846  3176.90235472 4759.47476094]
mean:  3620.2716936384195
sd:  785.2159236669186
rmse_test:  4092.4367821371993
-------------------------------------------------------------------------------


In [14]:
from sklearn.svm import SVR

svm_reg = SVR(gamma=0.01, kernel="rbf")
svm_reg.fit(X_train, y_train)

print_kfold_scores(svm_reg, "SupportVectorRegressor")
print("-------------------------------------------------------------------------------")


name:  SupportVectorRegressor
rmse:  [ 9376.59267308  8882.71035611  8469.06707871  7542.6154643
  9567.50561198  8812.31615021  4699.83454408  8909.1512411
 10775.36600733  9672.34562477]
mean:  8670.75047516624
sd:  1547.2014493689721
rmse_test:  10504.014695763755
-------------------------------------------------------------------------------


### GridSearch 

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
grid = [{'n_estimators': [10, 20, 30, 40], 'max_features': [4, 8]}]

forest_reg_adv = RandomForestRegressor()
search = GridSearchCV(forest_reg_adv, grid, cv=10, scoring='neg_mean_squared_error')

search.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [4, 8],
     

In [17]:
best_params = search.best_params_
print('best hyperparams in search : ', best_params)

best hyperparams in search :  {'max_features': 8, 'n_estimators': 20}


In [18]:
search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=8, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [19]:
search_results = search.cv_results_

params = search_results['params']
mean_scores = search_results['mean_test_score']
for i in range(len(params)):
    mean_score = mean_scores[i]
    param = params[i]
    print('mse: ', np.sqrt(-mean_score), 'params: ', param)


mse:  3611.8161777276146 params:  {'max_features': 4, 'n_estimators': 10}
mse:  3571.9339505221337 params:  {'max_features': 4, 'n_estimators': 20}
mse:  3497.02776676231 params:  {'max_features': 4, 'n_estimators': 30}
mse:  3422.970975134053 params:  {'max_features': 4, 'n_estimators': 40}
mse:  3416.799969839953 params:  {'max_features': 8, 'n_estimators': 10}
mse:  3411.9457187315597 params:  {'max_features': 8, 'n_estimators': 20}
mse:  3442.1724798507785 params:  {'max_features': 8, 'n_estimators': 30}
mse:  3438.4779656614032 params:  {'max_features': 8, 'n_estimators': 40}


In [20]:
final_model = search.best_estimator_

In [21]:
predictions_with_grid_search = final_model.predict(X_test)
mse = mean_squared_error(y_test, predictions_with_grid_search)
test_rmse = np.sqrt(mse)
print("rmse_test: ", test_rmse)

rmse_test:  3761.810283871597
