<a href="https://colab.research.google.com/github/sim-so/HandsOnML/blob/main/06_DecisionTree_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 06 Decision Tree 연습문제

In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt

## 7. moons 데이터셋에 결정 트리를 훈련시키고 parameter tuning 하기

In [2]:
# One seed for the whole notebook: it drives NumPy's global RNG and is passed
# as `random_state` to the scikit-learn calls in later cells.
random_state = 33
np.random.seed(random_state)

In [3]:
from sklearn.datasets import make_moons

# Seed the generator explicitly instead of relying on the global NumPy RNG state,
# so re-running this cell on its own (or out of order) yields the same dataset.
Xm, ym = make_moons(n_samples=1000, noise=0.4, random_state=random_state)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the split is seeded so it is repeatable.
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    Xm,
    ym,
    test_size=0.2,
    random_state=random_state
)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are tuned below.
tree_clf = DecisionTreeClassifier(random_state=random_state)

# Search space: 28 leaf budgets x 3 split thresholds x 2 depths = 168 candidates.
params = dict(
    max_leaf_nodes=[*range(2, 30)],
    min_samples_split=[2, 3, 4],
    max_depth=[2, 3],
)

# Exhaustive search scored with 4-fold cross-validation on the training set.
clf = GridSearchCV(tree_clf, param_grid=params, cv=4, verbose=1)
clf.fit(Xm_train, ym_train)

Fitting 4 folds for each of 168 candidates, totalling 672 fits


GridSearchCV(cv=4, estimator=DecisionTreeClassifier(random_state=33),
             param_grid={'max_depth': [2, 3],
                         'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19, 20, 21,
                                            22, 23, 24, 25, 26, 27, 28, 29],
                         'min_samples_split': [2, 3, 4]},
             verbose=1)

In [6]:
# Display the tree configuration that scored best in the grid search.
clf.best_estimator_

DecisionTreeClassifier(max_depth=3, max_leaf_nodes=4, random_state=33)

In [7]:
# Mean cross-validated score of the best candidate
# (presumably accuracy, since no `scoring` was passed to GridSearchCV).
clf.best_score_

0.8262499999999999

In [8]:
from sklearn.metrics import accuracy_score

In [9]:
# Evaluate the tuned tree on the held-out test set.
# Note: `ym_pred` is reassigned later with the forest's predictions.
ym_pred = clf.predict(Xm_test)
accuracy_score(ym_test, ym_pred)

0.89

## 8. 랜덤 포레스트 만들기

In [10]:
from sklearn.model_selection import ShuffleSplit
from sklearn.base import clone

from scipy.stats import mode

class RandomForestClassifier():
    """Minimal forest of decision trees combined by majority vote.

    The constructor loosely follows scikit-learn's ``RandomForestClassifier``.
    Every tree is a clone of one ``DecisionTreeClassifier`` template and is
    trained on its own random subset of the training rows.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the forest.
    max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes
        Forwarded unchanged to the ``DecisionTreeClassifier`` template.
    boostrap : bool, default=True
        Stored but currently has no effect: subsets are always drawn *without*
        replacement via ``ShuffleSplit``. The (misspelled) name is kept so that
        existing callers keep working.
    max_samples : int or None, default=None
        Number of training rows given to each tree. ``None`` (or a value equal
        to the number of rows) trains every tree on the full training set.
    random_state : int or None, default=None
        Seed used both for drawing the subsets and for every tree.
    """

    def __init__(
        self,
        n_estimators,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_leaf_nodes=None,
        boostrap=True,
        max_samples=None,
        random_state=None,
    ):
        self.n_estimators = n_estimators                        # number of trees
        self.boostrap = boostrap                                # kept for API compatibility; not used
        self.max_samples = max_samples                          # rows per tree (None -> all rows)
        self.random_state = random_state
        self.estimator_ = DecisionTreeClassifier(               # template cloned once per tree in fit()
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_leaf_nodes=max_leaf_nodes,
            random_state=random_state,
        )
        print(self.estimator_)                                  # echo the template's parameters

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own random subset of rows.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self : RandomForestClassifier
            The fitted forest, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length, or ``max_samples`` is not an
            integer in ``[1, n_samples]``.
        """
        # Convert up front so positional index arrays work for lists and pandas objects too.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)
        if len(y) != n_samples:
            raise ValueError(
                "X and y must have the same number of rows, got %d and %d" % (n_samples, len(y))
            )

        n_sub = n_samples if self.max_samples is None else self.max_samples
        if not isinstance(n_sub, (int, np.integer)) or not 1 <= n_sub <= n_samples:
            raise ValueError(
                "max_samples must be None or an integer in [1, %d], got %r" % (n_samples, self.max_samples)
            )

        if n_sub == n_samples:
            # ShuffleSplit rejects an empty test split, so hand every tree the full set.
            self.subsets_ = [(X, y)] * self.n_estimators
        else:
            # The "test" part of each split is simply the rows left out of that tree's subset.
            splitter = ShuffleSplit(
                n_splits=self.n_estimators,
                test_size=n_samples - n_sub,
                random_state=self.random_state,
            )
            self.subsets_ = [(X[i_train], y[i_train]) for i_train, _ in splitter.split(X)]

        self.estimators_ = [clone(self.estimator_) for _ in range(self.n_estimators)]
        for estimator, (X_sub_train, y_sub_train) in zip(self.estimators_, self.subsets_):
            estimator.fit(X_sub_train, y_sub_train)
        return self

    def predict(self, X):
        """Predict by majority vote over all trees.

        Ties are resolved in favour of the smallest label, which matches the
        behaviour of ``scipy.stats.mode`` used previously. Votes are tallied
        with NumPy only, so non-numeric labels work and the result does not
        depend on the installed SciPy version.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
        """
        # votes[i, j] is the label tree i assigns to sample j.
        votes = np.asarray([estimator.predict(X) for estimator in self.estimators_])
        classes = np.unique(votes)                              # sorted, so argmax below picks the smallest label on ties
        # tallies[k, j] is the number of trees voting classes[k] for sample j.
        tallies = np.stack([(votes == label).sum(axis=0) for label in classes])
        return classes[tallies.argmax(axis=0)]

In [11]:
# Forest of 1000 trees, each trained on 100 samples, reusing the hyperparameters
# selected by the grid search for the single tree.
rf_clf = RandomForestClassifier(
    n_estimators=1000,
    max_samples=100,
    random_state=random_state,
    **clf.best_params_
)

DecisionTreeClassifier(max_depth=3, max_leaf_nodes=4, random_state=33)


In [12]:
# Train every tree of the forest on its own subset of the training data.
rf_clf.fit(Xm_train, ym_train)

In [13]:
# Majority-vote predictions of the forest on the held-out test set
# (overwrites the single tree's `ym_pred` from the earlier cell).
ym_pred = rf_clf.predict(Xm_test)

In [14]:
# Test-set accuracy of the forest, to compare with the single tuned tree above.
accuracy_score(ym_test, ym_pred)

0.895