In [4]:
import pandas as pd
from sklearn.datasets import load_boston
dataset = load_boston()

#set dataframe
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(dataset.target, columns = ['MEDV'])

# check the shape
print('----------------------------------------------------------------------------------------')
print('X shape: (%i,%i)' %X.shape)
print('y shape: (%i,%i)' %y.shape)
print('----------------------------------------------------------------------------------------')
print(y.describe())
print('----------------------------------------------------------------------------------------')
print(X.join(y).head())
print('----------------------------------------------------------------------------------------')
print(dataset.DESCR)

----------------------------------------------------------------------------------------
X shape: (506,13)
y shape: (506,1)
----------------------------------------------------------------------------------------
             MEDV
count  506.000000
mean    22.532806
std      9.197104
min      5.000000
25%     17.025000
50%     21.200000
75%     25.000000
max     50.000000
----------------------------------------------------------------------------------------
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21

In [6]:
from sklearn.ensemble import GradientBoostingRegressor

est = GradientBoostingRegressor(max_depth=3, random_state=42)
est.fit(X,y.as_matrix().ravel())

from sklearn.metrics import r2_score
y_true = y.as_matrix().ravel()
y_pred = est.predict(X)
r2 = r2_score(y_true, y_pred)
print('R2 score of the descripitive model:%.3f' %r2)


R2 score of the descripitive model:0.975


In [11]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# 交差検証のためデータを訓練とテストに分割
# 訓練を80%, テストを20%に分割 (できてない。するなら、test_size=0.20)
# 訓練とテストにランダム分割するだけの単純な交差検証はhold-outと呼ばれる

X_train, X_test, y_train, y_test = train_test_split(X,y)

# 比較用に二つのパラメータ違いのモデルを構築
# standard tree model
est1 = GradientBoostingRegressor(max_depth=3, random_state=42)
est1.fit(X_train,y_train.as_matrix().ravel())
#deeper tree model
est2 = GradientBoostingRegressor(max_depth=10,random_state=42)
est2.fit(X_train,y_train.as_matrix().ravel())

# モデルパフォーマンス指標(R2とする)を取得
from sklearn.metrics import r2_score
# for training data
r2_est1_train = r2_score(y_train.as_matrix().ravel(), est1.predict(X_train))
r2_est2_train = r2_score(y_train.as_matrix().ravel(), est2.predict(X_train))
#for test data
r2_est1_test = r2_score(y_test.as_matrix().ravel(), est1.predict(X_test))
r2_est2_test = r2_score(y_test.as_matrix().ravel(), est2.predict(X_test))


# 性能指標の表示
# 以下のスコアをどのように評価すべきか？ --> Keyword: overfitting, train test gap
print('-----------------------------------------------------')
print('Train Score(est1, est2) : (%.3f, %.3f)' % (r2_est1_train, r2_est2_train))
print('Test Score(est1, est2) : (%.3f, %.3f)' % (r2_est1_test, r2_est2_test))


#est1のパタメータ条件で最終モデルを構築
est1.fit(X,y.as_matrix().ravel())
print('-----------------------------------------------------')
print(est1)

#過学習モデルも参考のため構築
est2.fit(X,y.as_matrix().ravel())
print('-----------------------------------------------------')
print(est2)



-----------------------------------------------------
Train Score(est1, est2) : (0.981, 1.000)
Test Score(est1, est2) : (0.852, 0.718)
-----------------------------------------------------
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)
-----------------------------------------------------
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, m