### Scikit-Learn 맛보기
##### Iris 분류 - 결정 트리(DecisionTree)

1. 데이터 탐색

In [103]:
from sklearn.datasets import load_iris
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

In [104]:
import pandas as pd

In [105]:
# Data access: list the keys available on the loaded dataset object.
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [106]:
# iris is a dictionary, but its entries are usually read as attributes
# (dict_name.key_name), as shown here.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [107]:
# Names of the feature columns, and the container type of the feature matrix.
iris.feature_names, type(iris.data)

(['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 numpy.ndarray)

In [108]:
# Peek at the first three rows of the feature matrix.
iris.data[:3]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2]])

In [109]:
# Wrap the feature matrix in a DataFrame, labelling columns with the feature names.
df = pd.DataFrame(
    data=iris.data,
    columns=iris.feature_names,
)

In [110]:
# Attach the integer class label as a 'species' column, then preview the first rows.
df = df.assign(species=iris.target)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [111]:
# Distribution of the y values (number of rows per class label)
df['species'].value_counts()

0    50
1    50
2    50
Name: species, dtype: int64

In [112]:
# Description of the dataset
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [113]:
# Basic summary statistics, computed separately for each species label
df.groupby('species').describe()

Unnamed: 0_level_0,sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal width (cm),sepal width (cm),...,petal length (cm),petal length (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm)
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,50.0,5.006,0.35249,4.3,4.8,5.0,5.2,5.8,50.0,3.428,...,1.575,1.9,50.0,0.246,0.105386,0.1,0.2,0.2,0.3,0.6
1,50.0,5.936,0.516171,4.9,5.6,5.9,6.3,7.0,50.0,2.77,...,4.6,5.1,50.0,1.326,0.197753,1.0,1.2,1.3,1.5,1.8
2,50.0,6.588,0.63588,4.9,6.225,6.5,6.9,7.9,50.0,2.974,...,5.875,6.9,50.0,2.026,0.27465,1.4,1.8,2.0,2.3,2.5


2. 학습데이터와 테스트데이터 분리

In [114]:
from sklearn.model_selection import train_test_split
# Split into training and test sets; with no test_size given, the default
# ratio is train : test = 3 : 1.
# Upper-case X denotes a multi-dimensional (2-D) array, lower-case y a 1-D array.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    random_state=2023,
)
# Shapes of the four resulting arrays
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((112, 4), (38, 4), (112,), (38,))

In [115]:
# Distribution of the y values: count how often each class label occurs in y_train.
import numpy as np
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([34, 37, 41], dtype=int64))

In [116]:
# Redo the split so the y classes are evenly represented, i.e. (37, 37, 38).
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    stratify=iris.target,   # keeps the distribution of y values even
    random_state=2023,
)
# Shapes of the four resulting arrays
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((112, 4), (38, 4), (112,), (38,))

In [117]:
# Re-check the per-class counts of y_train.
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([37, 37, 38], dtype=int64))

In [118]:
# Control the train/test ratio explicitly through test_size.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,          # train : test = 4 : 1
    stratify=iris.target,
    random_state=2023,
)
# Shapes of the four resulting arrays
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((120, 4), (30, 4), (120,), (30,))

In [119]:
# Re-check the per-class counts of y_train.
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([40, 40, 40], dtype=int64))

3. 학습 (훈련, Train)

In [120]:
# Discard any classifier left over from a previous run so the cells below
# start from a clean state. On a fresh kernel (Restart & Run All) `dtc` has
# not been created yet, and an unguarded `del dtc` would raise NameError and
# stop the notebook, so a missing name is deliberately tolerated here.
try:
    del dtc
except NameError:
    pass

In [121]:
# 결정 트리 모델
from sklearn.tree import DecisionTreeClassifier

In [122]:
# Create the model (i.e. instantiate the estimator object); the seed is fixed
# through random_state. The bare `dtc` on the last line displays the estimator.
dtc = DecisionTreeClassifier(random_state=2023)
dtc

In [123]:
# Hyperparameters currently set on the model
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2023,
 'splitter': 'best'}

In [124]:
# Run the training (fit) on the training split
dtc.fit(X_train, y_train)

4. 예측 (Prediction)

In [125]:
# When predicting, only the X values are passed in; the y values are not given.
pred = dtc.predict(X_test)

In [126]:
# Put the true labels and the predicted labels side by side for inspection.
res_df = pd.DataFrame(
    {
        'y 실제값': y_test,
        'y 예측값': pred,
    }
)
res_df.head()

Unnamed: 0,y 실제값,y 예측값
0,2,2
1,0,0
2,1,1
3,1,1
4,0,0


5. 평가(Evaluation)

In [127]:
from sklearn.metrics import accuracy_score
# Compare the predictions against the true test labels.
accuracy_score(y_test, pred)

0.9666666666666667

In [128]:
# Steps 4) and 5) in a single call: predict on X_test and score against y_test.
dtc.score(X_test, y_test)

0.9666666666666667