In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Download the whitespace-separated table, skipping the first 22 lines
# (presumably a descriptive header -- confirm against the source file).
data_url = "https://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\\s+", skiprows=22, header=None)

# Rows are consumed in pairs: every even-indexed row contributes all of its
# columns, and the following odd-indexed row contributes its first two columns
# as extra features and its third column as the target.
# NOTE(review): this implies each record spans two physical lines -- confirm.
raw_values = raw_df.values
even_rows = raw_values[::2]
odd_rows = raw_values[1::2]
data = np.hstack([even_rows, odd_rows[:, :2]])
target = odd_rows[:, 2]

In [None]:
# Feature matrix and target vector under the short names used by later cells.
x, y = data, target

# Hold out a test split; the fixed random_state keeps the partition identical
# across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=233,
)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Baseline: an unconstrained regression tree fitted on the training split.
# random_state is fixed because the tree permutes features at every split, so
# ties between equally good splits could otherwise resolve differently on each
# run and change the scores reported below.
reg = DecisionTreeRegressor(random_state=233)
reg.fit(x_train, y_train)

In [None]:
# Score the fitted model (R^2 for scikit-learn regressors) on the held-out
# split first and on the training split second, then display both as a tuple.
test_r2 = reg.score(x_test, y_test)
train_r2 = reg.score(x_train, y_train)
test_r2, train_r2

### 绘制学习曲线

In [None]:
from sklearn.metrics import r2_score
plt.rcParams['figure.figsize'] = (12, 8)

# Tree depths to compare; the 2x2 subplot grid below fits exactly four entries.
max_depth = [2, 5, 10, 20]

# R^2 is undefined for a single sample (scikit-learn returns NaN and warns),
# so the learning curve starts from a training prefix of two samples.
min_train_size = 2
train_sizes = range(min_train_size, len(x_train) + 1)

for i, depth in enumerate(max_depth):
    # Fixed seed so the curves are reproducible from run to run.
    reg = DecisionTreeRegressor(max_depth=depth, random_state=233)
    # Both lists hold R^2 scores (higher is better), one entry per training size.
    train_scores, test_scores = [], []
    for size in train_sizes:
        # Refit on the first `size` training samples only.
        reg.fit(x_train[:size], y_train[:size])
        y_train_pred = reg.predict(x_train[:size])
        train_scores.append(r2_score(y_train[:size], y_train_pred))
        # The test score always uses the full held-out split.
        y_test_pred = reg.predict(x_test)
        test_scores.append(r2_score(y_test, y_test_pred))

    plt.subplot(2, 2, i + 1)
    # Negative R^2 values (possible for tiny training sets) are clipped from view.
    plt.ylim(0, 1.1)
    plt.title("Depth=%d" % depth)
    plt.xlabel("Training set size")
    plt.ylabel("R^2 score")
    plt.plot(train_sizes, train_scores, label="train", color='r')
    plt.plot(train_sizes, test_scores, label="test", color='b')
    plt.legend()
# Keep the added axis labels from overlapping neighbouring subplot titles.
plt.tight_layout()
plt.show()

### 网格搜索

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the search: tree depths 2..14 and minimum
# leaf sizes 3..19 (both upper bounds of range() are exclusive).
params = {
    "max_depth": list(range(2, 15)),
    "min_samples_leaf": list(range(3, 20)),
}

In [None]:
# Exhaustive search over `params` with 5-fold cross-validation, using all
# available cores (n_jobs=-1). The estimator is seeded so that ties between
# equally good splits resolve the same way on every run, which keeps
# best_params_ / best_score_ reproducible.
grid = GridSearchCV(
    DecisionTreeRegressor(random_state=233),
    params,
    n_jobs=-1,
    cv=5,
)

In [None]:
# Run the grid search on the training split only; the test split is not
# passed in here and is used later for the final evaluation.
grid.fit(x_train, y_train)

In [None]:
# Display the winning hyper-parameter combination together with its
# cross-validated score (mean over the CV folds, per scikit-learn's docs).
grid.best_params_, grid.best_score_

In [None]:
# Take the best estimator found by the grid search and report its score
# (R^2 for scikit-learn regressors) on the held-out split first, then on the
# training split.
reg = grid.best_estimator_
tuned_test_r2 = reg.score(x_test, y_test)
tuned_train_r2 = reg.score(x_train, y_train)
tuned_test_r2, tuned_train_r2