# Decision Tree Regression

In [40]:
# the code from the last work book
from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor as DTR

import pandas as pd
import numpy as np

# import the data
data = load_diabetes()
data

{'data': array([[ 0.04,  0.05,  0.06, ..., -0.  ,  0.02, -0.02],
        [-0.  , -0.04, -0.05, ..., -0.04, -0.07, -0.09],
        [ 0.09,  0.05,  0.04, ..., -0.  ,  0.  , -0.03],
        ...,
        [ 0.04,  0.05, -0.02, ..., -0.01, -0.05,  0.02],
        [-0.05, -0.04,  0.04, ...,  0.03,  0.04, -0.03],
        [-0.05, -0.04, -0.07, ..., -0.04, -0.  ,  0.  ]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
        128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
        150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
        200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
         42., 111.,  98., 164.,  

In [41]:
# You'll need these for hyperparameter optimisation

scores = ['neg_mean_squared_error', 'r2']
{'criterion': ['squared_error', 'absolute_error']}

{'criterion': ['squared_error', 'absolute_error']}

neg_mean_squared_error:
在sklearn当中，我们有两种方式调用这个评估指标，一种是使用sklearn专用的模型评估模块metrics里的类mean_squared_error，另一种是调用交叉验证的类cross_val_score并使用里面的scoring参数来设置使用均方误差。cross_val_score的均方误差为负。我们在决策树和随机森林中都提到过，虽然均方误差永远为正，但是sklearn中的参数scoring下，均方误差作为评判标准时，却是计算”负均方误差“（neg_mean_squared_error）。这是因为sklearn在计算模型评估指标的时候，会考虑指标本身的性质，均方误差本身是一种误差，所以被sklearn划分为模型的一种损失(loss)。在sklearn当中，所有的损失都使用负数表示，因此均方误差也被显示为负数了。真正的均方误差MSE的数值，其实就是neg_mean_squared_error去掉负号的数字。

In [42]:
# Your code here
from sklearn.datasets import load_breast_cancer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier as DTC
import pandas as pd
import numpy as np

In [43]:
#4. Separate the target (𝒀) from the features (𝑿s)


In [58]:
# create a DataFrame of features 
diabetes_df = pd.DataFrame(data.data, columns=data.feature_names)

# normalise the data
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
df = scaler.fit_transform(diabetes_df)
diabetes_df = pd.DataFrame(df, columns=data.feature_names)

# create a target variable
diabetes_target = data.target

In [59]:
#5. Split the data into training and test


In [73]:
from sklearn.model_selection  import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(diabetes_df, diabetes_target, test_size = 0.2)


In [74]:
#6. Run the model on the training data


In [79]:
# fit a decision tree to the training data
tree_model = DTC()
tree_model_fit = tree_model.fit(X_train, Y_train)

In [80]:
#7. Check the performance metrics / score
# scores = ['neg_mean_squared_error', 'r2']

In [109]:
# predict every Y value in the test data
predicted = tree_model_fit.predict(X_test)

# calculate RMSE (root mean square error) and R^2 (predictive power)
from sklearn.metrics import mean_squared_error, r2_score
rmse = (np.sqrt(mean_squared_error(Y_test, predicted)))
r2 = r2_score(Y_test, predicted)

# print the performance metrics
print("Model performance")
print("--------------------------------------")
print(f'rmse is {rmse}')
print(f'r2 score is {r2}')

Model performance
--------------------------------------
rmse is 83.43126081331191
r2 score is -0.18877815161185318


In [110]:
#8. Optimise hyperparameters


In [111]:
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'criterion': ['squared_error', 'absolute_error'],
                     'max_depth': [3, 5, 7],
                     'min_samples_split': [3, 5, 7],
                     'max_features': ["sqrt", "log2", None]}]

scores = ['rmse', 'r2']

scores

['rmse', 'r2']

In [None]:
for score in scores:
    print("# Tuning hyperparameters for %s" % score)
    print("\n")
    clf = GridSearchCV(DTC(), tuned_parameters, cv=5,
                       scoring= score)
    clf.fit(X_train, Y_train)
    print("Best parameters set found on the training set:")
    print(clf.best_params_)
    print("\n")

In [99]:
#9. Check the performance metrics / score again
