In [22]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [8]:
boston = load_boston()
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [6]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'],
      dtype='<U7')

In [9]:
y = boston.target
X = boston.data
X.shape

(506, 13)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
model = GradientBoostingRegressor().fit(X_train, y_train)

In [15]:
model.score(X_test, y_test)

0.89444807909619295

In [16]:
model.feature_importances_

array([ 0.0815703 ,  0.00763592,  0.04368293,  0.0037814 ,  0.0433809 ,
        0.19474888,  0.10601505,  0.13607367,  0.01964478,  0.07849985,
        0.05523443,  0.06247165,  0.16726024])

In [17]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'],
      dtype='<U7')

In [18]:
y

array([ 24. ,  21.6,  34.7,  33.4,  36.2,  28.7,  22.9,  27.1,  16.5,
        18.9,  15. ,  18.9,  21.7,  20.4,  18.2,  19.9,  23.1,  17.5,
        20.2,  18.2,  13.6,  19.6,  15.2,  14.5,  15.6,  13.9,  16.6,
        14.8,  18.4,  21. ,  12.7,  14.5,  13.2,  13.1,  13.5,  18.9,
        20. ,  21. ,  24.7,  30.8,  34.9,  26.6,  25.3,  24.7,  21.2,
        19.3,  20. ,  16.6,  14.4,  19.4,  19.7,  20.5,  25. ,  23.4,
        18.9,  35.4,  24.7,  31.6,  23.3,  19.6,  18.7,  16. ,  22.2,
        25. ,  33. ,  23.5,  19.4,  22. ,  17.4,  20.9,  24.2,  21.7,
        22.8,  23.4,  24.1,  21.4,  20. ,  20.8,  21.2,  20.3,  28. ,
        23.9,  24.8,  22.9,  23.9,  26.6,  22.5,  22.2,  23.6,  28.7,
        22.6,  22. ,  22.9,  25. ,  20.6,  28.4,  21.4,  38.7,  43.8,
        33.2,  27.5,  26.5,  18.6,  19.3,  20.1,  19.5,  19.5,  20.4,
        19.8,  19.4,  21.7,  22.8,  18.8,  18.7,  18.5,  18.3,  21.2,
        19.2,  20.4,  19.3,  22. ,  20.3,  20.5,  17.3,  18.8,  21.4,
        15.7,  16.2,

In [19]:
# run cross validation with KFold and search through a few hyper-parameters to get a better r2 value.

In [34]:
cv = KFold(n_splits=10)
for estimators in range(10, 1000, 100):
    for depth in range(1, 4):
        model = GradientBoostingRegressor().fit(X_train, y_train)
        scores = []
        for train_i, test_i in cv.split(X):
            #Xr, yr, Xt, yt = X.loc[train_i], y.loc[train_i], X.loc[test_i], y.loc[test_i]
            Xr, yr, Xt, yt = X[train_i], y[train_i], X[test_i], y[test_i]
            model.fit(Xr, yr)
            scores.append(model.score(Xt, yt))
        print('depth:', depth,'estimators:', estimators, 'scores:', sum(scores)/len(scores), 'min: ', min(scores), 'max:',max(scores))

depth: 1 estimators: 10 scores: 0.458653806292 min:  -0.757679723631 max: 0.8392746127
depth: 2 estimators: 10 scores: 0.467501690464 min:  -0.74306712459 max: 0.842592230356
depth: 3 estimators: 10 scores: 0.474161658266 min:  -0.654573666676 max: 0.841465341979
depth: 1 estimators: 110 scores: 0.464339122713 min:  -0.654938714391 max: 0.839687899546
depth: 2 estimators: 110 scores: 0.462977877928 min:  -0.712432845251 max: 0.841331087564
depth: 3 estimators: 110 scores: 0.467775003414 min:  -0.726840575029 max: 0.840776271859
depth: 1 estimators: 210 scores: 0.460631560062 min:  -0.735831716291 max: 0.842254367072
depth: 2 estimators: 210 scores: 0.472551892288 min:  -0.652379392179 max: 0.840253103568
depth: 3 estimators: 210 scores: 0.472499023756 min:  -0.627231525745 max: 0.832456716532
depth: 1 estimators: 310 scores: 0.465965560674 min:  -0.749962104401 max: 0.840831449601
depth: 2 estimators: 310 scores: 0.458096264871 min:  -0.726222177516 max: 0.841155760773
depth: 3 estimat

In [30]:
'''
cv = KFold(n_splits=5)
for estimators in range(100, 1000, 100):
    for depth in range(1, 4):
        model = GradientBoostingRegressor(n_estimators=estimators, max_depth=depth)
        scores = []
        for train_i, test_i in cv.split(X):
            Xr, yr, Xt, yt = X[train_i], y[train_i], X[test_i], y[test_i]
            model.fit(Xr, yr)
            scores.append(model.score(Xt, yt))
        print('depth:', depth, 'estimators:', estimators, 'scores:', sum(scores)/len(scores), 'min:', min(scores), 'max:', max(scores))
'''        

"\ncv = KFold(n_splits=5)\nfor estimators in range(100, 1000, 100):\n    for depth in range(1, 4):\n        model = GradientBoostingRegressor(n_estimators=estimators, max_depth=depth)\n        scores = []\n        for train_i, test_i in cv.split(X):\n            Xr, yr, Xt, yt = X[train_i], y[train_i], X[test_i], y[test_i]\n            model.fit(Xr, yr)\n            scores.append(model.score(Xt, yt))\n        print('depth:', depth, 'estimators:', estimators, 'scores:', sum(scores)/len(scores), 'min:', min(scores), 'max:', max(scores))\n"