In [65]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [18]:
print(load_boston().DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [19]:
X = pd.DataFrame(load_boston()['data'], columns = load_boston()['feature_names'])
y = pd.DataFrame(load_boston()['target'],columns = ['MEDV'])
Xy = pd.concat([X, y], axis=1)

In [20]:
display(X.head())
display(y.head())
print(X.shape, y.shape)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


(506, 13) (506, 1)


In [21]:
display(X.info())
display(X.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB


None

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [22]:
print(X.isnull().sum(), '\n')

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64 



### Divide dataset into train and validation (test) samples

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [25]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(404, 13) (102, 13) (404, 1) (102, 1)


In [26]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### LinearRegression

In [30]:
lr = LinearRegression()

In [31]:
lr_params = {
    'fit_intercept': [False, True]
}

In [32]:
grid_lr = GridSearchCV(lr, lr_params, cv=10, scoring='r2')
grid_lr.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LinearRegression(copy_X=True, fit_intercept=True,
                                        n_jobs=None, normalize=False),
             iid='warn', n_jobs=None,
             param_grid={'fit_intercept': [False, True]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=0)

In [33]:
print(grid_lr.best_params_)
print(grid_lr.best_score_)
print(grid_lr.best_estimator_)

{'fit_intercept': True}
0.7049331148173825
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)


### DecisionTreeRegressor

In [34]:
tree = DecisionTreeRegressor()

In [48]:
tree_params = {
    'max_depth': list(range(1, 11)),
    'min_samples_split': [0.1, 0.2, 0.5, 1.0],
    'criterion': ['mse', 'mae']
}

In [49]:
grid_tree = GridSearchCV(tree, tree_params, cv=10, scoring='r2')
grid_tree.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['mse', 'mae'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [0.1, 0.2, 0.5, 1.0]},
             pre_

In [50]:
print(grid_tree.best_params_)
print(grid_tree.best_score_)
print(grid_tree.best_estimator_)

{'criterion': 'mse', 'max_depth': 10, 'min_samples_split': 0.1}
0.7543606899016064
DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.1, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')


### KNeighborsRegressor

In [29]:
knn = KNeighborsRegressor()

In [54]:
knn_params = {
    'n_neighbors':list(range(1, 20)),
    'weights': ['uniform', 'distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size':list(range(1, 20))
}

In [55]:
grid_knn = GridSearchCV(knn, knn_params, cv=10, scoring='r2')
grid_knn.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=0)

In [56]:
print(grid_knn.best_params_)
print(grid_knn.best_score_)
print(grid_knn.best_estimator_)

{'algorithm': 'brute', 'leaf_size': 1, 'n_neighbors': 11, 'weights': 'distance'}
0.7373836764305484
KNeighborsRegressor(algorithm='brute', leaf_size=1, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                    weights='distance')


### RandomForestRegressor

In [57]:
forest = RandomForestRegressor()

In [60]:
forest_params = {
    'n_estimators': [5, 10, 20, 30, 40],
    'max_depth': [None, 1, 2, 5, 10, 25],
    'min_samples_leaf': list(range(1, 10)),
    'max_features': list(range(1, 10)),
    'criterion': ['mse', 'friedman_mse', 'mae']
}

In [61]:
grid_forest = GridSearchCV(forest, forest_params, cv=10, scoring='r2')
grid_forest.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['mse', 'friedman

In [62]:
print(grid_forest.best_params_)
print(grid_forest.best_score_)
print(grid_forest.best_estimator_)

{'criterion': 'mae', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 1, 'n_estimators': 40}
0.8614062253775506
RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
                      max_features=7, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=40,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)


### GradientBoostingRegressor

In [66]:
gboost = GradientBoostingRegressor()

In [67]:
gboost_params = {
    'n_estimators': [5, 10, 20, 30, 40],
    'max_features': list(range(1, 10)),
    'max_depth': [None ,1, 2, 5, 10, 25],
    'learning_rate': [0.1, 0.3, 0.5, 0.7],
    'min_samples_leaf': list(range(1, 10))
}

In [68]:
grid_gboost = GridSearchCV(gboost, gboost_params, cv=10, scoring='r2')
grid_gboost.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_ite...
                             

In [69]:
print(grid_gboost.best_params_)
print(grid_gboost.best_score_)
print(grid_gboost.best_estimator_)

{'learning_rate': 0.3, 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 3, 'n_estimators': 20}
0.8700005297803008
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.3, loss='ls', max_depth=None,
                          max_features=5, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=3, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=20,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)


### Compare the results on the validation sample

In [72]:
estimators = {
    'GradientBoostingRegressor': grid_gboost,
    'RandomForestRegressor': grid_forest,
    'DecisionTreeRegressor': grid_tree,
    'KNeighborsRegressor': grid_knn,
    'LinearRegression': grid_lr
}

In [77]:
for key in estimators:
    value = estimators[key]
    print(
    '{0}:\n    {1}{2}\n    {3}{4}\n\n'.format(
        key, 
        "CV R^2: ", value.best_score_, 
        "Validation R^2: ", value.best_estimator_.score(X_test, y_test)
    )
)

GradientBoostingRegressor:
    CV R^2: 0.8700005297803008
    Validation R^2: 0.8672753403783692


RandomForestRegressor:
    CV R^2: 0.8614062253775506
    Validation R^2: 0.8993370718687299


DecisionTreeRegressor:
    CV R^2: 0.7543606899016064
    Validation R^2: 0.8097400538461644


KNeighborsRegressor:
    CV R^2: 0.7373836764305484
    Validation R^2: 0.7563600496020787


LinearRegression:
    CV R^2: 0.7049331148173825
    Validation R^2: 0.7030460849621956




In [None]:
pass