In [28]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np

In [29]:
boston = load_boston()
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [30]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'],
      dtype='<U7')

In [31]:
y = boston.target
X = boston.data
X.shape

(506, 13)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [33]:
# run cross validation with KFold and search through a few hyper-parameters to get a better r2 value.
model = GradientBoostingRegressor().fit(X_train, y_train)

In [34]:
model.score(X_test, y_test)

0.8951385695511348

In [35]:
model.feature_importances_

array([ 0.08051985,  0.00291911,  0.0468287 ,  0.00498263,  0.04201283,
        0.20162251,  0.10572856,  0.13046557,  0.02030772,  0.07321047,
        0.05522933,  0.06367674,  0.17249599])

In [36]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'],
      dtype='<U7')

In [52]:
cv = KFold(n_splits=3)
for estimators in range(100, 1000, 100):
    for depth in range(1, 4):
        model = GradientBoostingRegressor(n_estimators=estimators, max_depth=depth)
        scores = []
        for train_i, test_i in cv.split(X):
            Xr, yr, Xt, yt = X[train_i], y[train_i], X[test_i], y[test_i]
            model.fit(Xr, yr)
            scores.append(model.score(Xt, yt))
        print('depth:', depth, 'estimators:', estimators, 'scores:', sum(scores)/len(scores), 'min:', min(scores), 'max:', max(scores))

depth: 1 estimators: 100 scores: 0.526570364061 min: 0.211776889498 max: 0.848306590675
depth: 2 estimators: 100 scores: 0.578030789459 min: 0.228141067528 max: 0.831927105549
depth: 3 estimators: 100 scores: 0.579169589295 min: 0.269419992912 max: 0.831169114272
depth: 1 estimators: 200 scores: 0.546699038169 min: 0.24886843611 max: 0.831402118158
depth: 2 estimators: 200 scores: 0.576409687693 min: 0.216313002708 max: 0.826053360508
depth: 3 estimators: 200 scores: 0.561614941321 min: 0.253606743986 max: 0.798512678834
depth: 1 estimators: 300 scores: 0.544126976099 min: 0.250374222591 max: 0.803719640266
depth: 2 estimators: 300 scores: 0.569506672404 min: 0.207285299927 max: 0.811242530235
depth: 3 estimators: 300 scores: 0.558393784894 min: 0.249081589141 max: 0.79911491835
depth: 1 estimators: 400 scores: 0.540777328167 min: 0.245571411797 max: 0.790152735152
depth: 2 estimators: 400 scores: 0.570350975795 min: 0.205233171361 max: 0.807516584669
depth: 3 estimators: 400 scores: 0

In [51]:
X.shape

(506, 13)