In [27]:
from sklearn import datasets, metrics

# 如果是分類問題，請使用 DecisionTreeClassifier，若為回歸問題，請使用 DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

### 建立模型四步驟
在 Scikit-learn 中，建立一個機器學習的模型其實非常簡單，流程大略是以下四個步驟

讀進資料，並檢查資料的 shape (有多少 samples (rows), 多少 features (columns)，label 的型態是什麼？)
讀取資料的方法：
使用 pandas 讀取 .csv 檔：pd.read_csv
使用 numpy 讀取 .txt 檔：np.loadtxt
使用 Scikit-learn 內建的資料集：sklearn.datasets.load_xxx
檢查資料數量：data.shape (data should be np.array or dataframe)
將資料切為訓練 (train) / 測試 (test)
train_test_split(data)
建立模型，將資料 fit 進模型開始訓練
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
將測試資料 (features) 放進訓練好的模型中，得到 prediction，與測試資料的 label (y_test) 做評估
clf.predict(x_test)
accuracy_score(y_test, y_pred)
f1_score(y_test, y_pred)

In [28]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

# Hold out 25% of the samples as the test set; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=4
)

# Build the model: a decision tree that splits on information gain (entropy).
# random_state is pinned as well because the tree randomly permutes features at
# every split, so an unseeded estimator can yield different trees (and different
# feature importances) from run to run even when the data split is fixed.
clf = DecisionTreeClassifier(criterion='entropy', random_state=4)

# Train the model on the training split.
clf.fit(x_train, y_train)

# Predict labels for the held-out test split.
y_pred = clf.predict(x_test)

In [29]:
# Fraction of test samples whose predicted label matches the true label.
acc = metrics.accuracy_score(y_test, y_pred)
# Label spelling corrected (was "Acuuracy").
print("Accuracy: ", acc)

Acuuracy:  0.9736842105263158


In [30]:
# Show the feature (column) names of the iris data, in column order.
iris_feature_names = iris.feature_names
print(iris_feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [31]:
# Per-feature importance learned by the fitted tree, in the same order as
# iris.feature_names.
iris_importances = clf.feature_importances_
print("Feature importance: ", iris_importances)

Feature importance:  [0.0156062  0.         0.62264163 0.36175217]


### 作業
試著調整 DecisionTreeClassifier(...) 中的參數，並觀察是否會改變結果？
改用其他資料集 (boston, wine)，並與回歸模型的結果進行比較（注意：`load_boston` 已自 scikit-learn 1.2 移除，新版環境可改用 `fetch_california_housing` 等回歸資料集）

In [33]:
from sklearn import datasets, metrics
import pandas as pd
# 如果是分類問題，請使用 DecisionTreeClassifier，若為回歸問題，請使用 DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [34]:

# Seed shared by the train/test split and the tree so repeated runs of this
# cell give the same accuracy and feature importances.
SEED = 4

# Load the wine dataset; features go into a DataFrame so columns carry their names.
wine = datasets.load_wine()
wine_x = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_y = wine.target

# Hold out 30% of the samples for evaluation.
train_x, test_x, train_y, test_y = train_test_split(
    wine_x, wine_y, test_size=0.3, random_state=SEED
)

# CART classifier. The hyper-parameters are written out explicitly so they are
# easy to tweak when experimenting with different settings.
CART = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=SEED,
)
CART.fit(train_x, train_y)
hat_y = CART.predict(test_x)

# scikit-learn metrics take (y_true, y_pred) in that order.
accuracy = metrics.accuracy_score(test_y, hat_y)

# Pair every feature name with the importance the fitted tree assigned to it.
# Built directly from the estimator attribute so `feature_importance` is only
# ever bound to the DataFrame, never to the raw array.
feature_importance = pd.DataFrame({
    'feature_name': wine.feature_names,
    'feature_import': CART.feature_importances_,
})

print('accuracy={:.3f}'.format(accuracy))
print('==============================================================================')
print(feature_importance)

accuracy=0.944
                    feature_name  feature_import
0                        alcohol        0.007717
1                     malic_acid        0.016537
2                            ash        0.000000
3              alcalinity_of_ash        0.000000
4                      magnesium        0.000000
5                  total_phenols        0.000000
6                     flavanoids        0.197876
7           nonflavanoid_phenols        0.000000
8                proanthocyanins        0.024241
9                color_intensity        0.000000
10                           hue        0.060749
11  od280/od315_of_diluted_wines        0.283395
12                       proline        0.409485
