In [4]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
# DecisionTreeClassifier : 의사결정 트리
# train_test_split : 훈련 세트와 테스트 세트로 나누는 함수

In [8]:
import pandas as pd

iris = load_iris()

iris_data = iris.data

iris_label = iris.target
print(iris_label)
print(iris.target_names) # target_names : 결괏값

iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)
iris_df['label'] = iris.target
iris_df.head(3) # head(3) : 3개만 출력

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size=0.2, random_state=11)
# test: 20%, train: 80%
# random_state : 난수 발생 seed 값

In [10]:
dt_clf = DecisionTreeClassifier(random_state=11)
dt_clf.fit(X_train, y_train) # fit : 훈련


In [11]:
pred = dt_clf.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, pred))

0.9333333333333333


In [14]:
iris_data = load_iris()
print(type(iris_data))

<class 'sklearn.utils._bunch.Bunch'>


In [15]:
keys = iris_data.keys()
print(keys)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [16]:
df_clf = DecisionTreeClassifier()
iris_data = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size=0.3, random_state=121)
df_clf.fit(X_train, y_train)
pred = df_clf.predict(X_test)
print(accuracy_score(y_test, pred))

0.9555555555555556


In [17]:
from sklearn.model_selection import KFold
import numpy as np

iris = load_iris()
features = iris.data
labels = iris.target
dt_clf = DecisionTreeClassifier(random_state=156)

kfold = KFold(n_splits=5)
cv_accuracy = []


In [18]:
n_iter = 0

for train_index, test_index in kfold.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    n_iter += 1

    accurary = np.round(accuracy_score(y_test, pred), 4)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]

    cv_accuracy.append(accurary)

print(np.mean(cv_accuracy))

0.9


In [19]:
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['label'] = iris.target
iris_df['label'].value_counts()

label
0    50
1    50
2    50
Name: count, dtype: int64

In [20]:
kfold = KFold(n_splits=3)
n_iter = 0
for train_index, test_index in kfold.split(iris_df):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]

    print(label_train.value_counts())
    print(label_test.value_counts())

label
1    50
2    50
Name: count, dtype: int64
label
0    50
Name: count, dtype: int64
label
0    50
2    50
Name: count, dtype: int64
label
1    50
Name: count, dtype: int64
label
0    50
1    50
Name: count, dtype: int64
label
2    50
Name: count, dtype: int64


In [21]:
from sklearn.model_selection import StratifiedKFold

# 골고루 분배 후
skf = StratifiedKFold(n_splits=3)
n_iter = 0

for train_index, test_index in skf.split(iris_df, iris_df['label']):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]

    print(label_train.value_counts())
    print(label_test.value_counts())

label
2    34
0    33
1    33
Name: count, dtype: int64
label
0    17
1    17
2    16
Name: count, dtype: int64
label
1    34
0    33
2    33
Name: count, dtype: int64
label
0    17
2    17
1    16
Name: count, dtype: int64
label
0    34
1    33
2    33
Name: count, dtype: int64
label
1    17
2    17
0    16
Name: count, dtype: int64


In [22]:
dt_clf = DecisionTreeClassifier(random_state=156)

skfold = StratifiedKFold(n_splits=3)
cv_accuracy = []

for train_index, test_index in skfold.split(features, labels):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))

print(np.mean(cv_accuracy))

0.9666666666666667


In [25]:
from sklearn.model_selection import cross_val_score

iris_data = load_iris()
dt_clf = DecisionTreeClassifier(random_state=156)

data = iris_data.data
target = iris_data.target

scores = cross_val_score(dt_clf, data, target, scoring='accuracy', cv=3)
print(scores)
print(np.mean(scores))

[0.98 0.94 0.98]
0.9666666666666667


In [28]:
# 하이퍼 파라미터 집합으로 튜닝
from sklearn.model_selection import GridSearchCV

iris_data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size=0.2, random_state=121)

dtree = DecisionTreeClassifier()

parameters = {'max_depth':[1,2,3], 'min_samples_split':[2,3]}
grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3, refit=True) # Hyper Parameter 튜닝
grid_dtree.fit(X_train, y_train) # 재학습

scores = pd.DataFrame(grid_dtree.cv_results_)
scores[['params', 'mean_test_score', 'rank_test_score','split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,1.0,0.95


In [29]:
print(max(scores['mean_test_score']))

0.975
