## Decision Tree 
- 의사결정 규칙을 나무구조로 도표화하여 관심대상이 되는 집단을 몇 개의 소집단으로 분류하거나 특정 값을 예측하는데 활용되는 분석 방법
- 용어 
    - root node : 뿌리 노드라고 하며 레이블(y)을 의미 
    - intermediate node : 중간 노드라고 하며 분류되는 과정 
    - terminal node : 끝 노드라고 하며 마지막으로 분류되는 기준 
- 분석 결과를 직관적으로 도식화하여 볼 수 있으므로 어떻게 분류되는지 알 수 있음

In [1]:
import pandas as pd 
data = pd.read_csv("../Data/breast-cancer-wisconsin.csv")
X = data.iloc[:, 1:10]
y = data[['Class']]

from sklearn.model_selection import * 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 410)

from sklearn.preprocessing import * 
minmax = MinMaxScaler() 
minmax.fit(X_train)
X_scaled_train = minmax.transform(X_train)
X_scaled_test = minmax.transform(X_test)

In [2]:
import sklearn
help(sklearn)

Help on package sklearn:

NAME
    sklearn

DESCRIPTION
    Machine learning module for Python
    
    sklearn is a Python module integrating classical machine
    learning algorithms in the tightly-knit world of scientific Python
    packages (numpy, scipy, matplotlib).
    
    It aims to provide simple and efficient solutions to learning problems
    that are accessible to everybody and reusable in various contexts:
    machine-learning as a versatile tool for science and engineering.
    
    See http://scikit-learn.org for complete documentation.

PACKAGE CONTENTS
    __check_build (package)
    _build_utils (package)
    _config
    _distributor_init
    _isotonic
    _loss (package)
    _min_dependencies
    base
    calibration
    cluster (package)
    compose (package)
    conftest
    covariance (package)
    cross_decomposition (package)
    datasets (package)
    decomposition (package)
    discriminant_analysis
    dummy
    ensemble (package)
    exceptions
    experime

In [3]:
from sklearn.tree import * 
dir(sklearn.tree)

['BaseDecisionTree',
 'DecisionTreeClassifier',
 'DecisionTreeRegressor',
 'ExtraTreeClassifier',
 'ExtraTreeRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_classes',
 '_criterion',
 '_export',
 '_reingold_tilford',
 '_splitter',
 '_tree',
 '_utils',
 'export_graphviz',
 'export_text',
 'plot_tree']

In [4]:
model = DecisionTreeClassifier()
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

1.0

In [6]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.9766081871345029

In [9]:
from sklearn.metrics import * 
con_train = confusion_matrix(y_train, pred_train)
print(con_train)

[[333   0]
 [  0 179]]


In [10]:
report_train = classification_report(y_train, pred_train)
print(report_train)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       179

    accuracy                           1.00       512
   macro avg       1.00      1.00      1.00       512
weighted avg       1.00      1.00      1.00       512



In [12]:
con_test = confusion_matrix(y_test, pred_test)
print(con_test)

[[108   3]
 [  1  59]]


In [13]:
report_test = classification_report(y_test, pred_test)
print(report_test)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       111
           1       0.95      0.98      0.97        60

    accuracy                           0.98       171
   macro avg       0.97      0.98      0.97       171
weighted avg       0.98      0.98      0.98       171



In [14]:
param_grid = {'max_depth' : range(2, 20, 2), "min_samples_leaf" : range(1, 50, 2)}
gridSearch = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
gridSearch.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(2, 20, 2),
                         'min_samples_leaf': range(1, 50, 2)})

In [15]:
print("Best param : {}".format(gridSearch.best_params_))
print("Best score : {}".format(gridSearch.best_score_))
print("test score : {}".format(gridSearch.score(X_scaled_test, y_test)))

Best param : {'max_depth': 4, 'min_samples_leaf': 5}
Best score : 0.9531505806205978
test score : 0.9649122807017544


In [16]:
from scipy.stats import randint 
param_dist = {
    "max_depth" : randint(low = 1, high = 20),
    "min_samples_leaf" : randint(low=1, high=50)}

randomSearch = RandomizedSearchCV(DecisionTreeClassifier(), param_dist, n_iter=100, cv = 5)
randomSearch.fit(X_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_iter=100,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x14057a0a0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x14056e430>})

In [17]:
print("Best Param : {}".format(randomSearch.best_params_))
print("Best score : {}".format(randomSearch.best_score_))
print("Test score : {}".format(randomSearch.score(X_scaled_test, y_test)))

Best Param : {'max_depth': 18, 'min_samples_leaf': 6}
Best score : 0.9550923281934134
Test score : 0.9707602339181286


### Regression

In [18]:
import pandas as pd 
data2 = pd.read_csv("../Data/house_price.csv", encoding="utf-8")
X = data2.iloc[:, 1:5]
y = data2[['house_value']]

from sklearn.model_selection import * 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 410)

from sklearn.preprocessing import * 
minmax = MinMaxScaler()
minmax.fit(X_train)
X_scaled_train = minmax.transform(X_train)
X_scaled_test = minmax.transform(X_test)

In [19]:
from sklearn.tree import *
model = DecisionTreeRegressor()
model.fit(X_scaled_train, y_train)
pred_train = model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

1.0

In [21]:
pred_test = model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.22011665472137054

In [22]:
import numpy as np 
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test,  pred_test)

RMSE_train = np.sqrt(MSE_train)
RMSE_test = np.sqrt(MSE_test)
print("학습데이터 RMSE : ", RMSE_train)
print("테스트 데이터 RMSE : ", RMSE_test)
# 과적합(Overfitting)

학습데이터 RMSE :  0.0
테스트 데이터 RMSE :  85878.96466492675


In [23]:
param_grid = {"max_depth" : range(2, 20, 2), "min_samples_leaf" : range(1, 50, 2) }
grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 5)
grid_search.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': range(2, 20, 2),
                         'min_samples_leaf': range(1, 50, 2)})

In [24]:
print("Best Param : {}".format(grid_search.best_params_))
print("Best Score : {}".format(grid_search.best_score_))
print("Test Score : {}".format(grid_search.score(X_scaled_test, y_test)))

Best Param : {'max_depth': 10, 'min_samples_leaf': 49}
Best Score : 0.5589131314794625
Test Score : 0.5801638960031434


In [26]:
param_dist = {
    "max_depth" : randint(low = 1, high = 20),
    "min_samples_leaf" : randint(low = 1, high=50)}
randomSearch = RandomizedSearchCV(DecisionTreeRegressor(), param_dist, n_iter=1000, cv = 5)
randomSearch.fit(X_scaled_test, y_test)

RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_iter=1000,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1372116a0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x136f34550>})

In [27]:
print("Best param : {}".format(randomSearch.best_params_))
print("Best score : {}".format(randomSearch.best_score_))
print("Test score : {}".format(randomSearch.score(X_scaled_test, y_test)))

Best param : {'max_depth': 7, 'min_samples_leaf': 26}
Best score : 0.5564530257487512
Test score : 0.6313239240781642
